From ad0254566ed9f1cb33338feb156c5751ec3747e4 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Thu, 4 Nov 2021 10:13:34 -0400 Subject: [PATCH 001/202] DOC v22.02 Updates --- CHANGELOG.md | 4 ++++ conda/environments/cudf_dev_cuda11.0.yml | 2 +- conda/environments/cudf_dev_cuda11.2.yml | 2 +- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 4 ++-- cpp/examples/basic/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 4 ++-- 7 files changed, 12 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b46ac22d767..4dd94954a82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# cuDF 22.02.00 (Date TBD) + +Please see https://github.com/rapidsai/cudf/releases/tag/v22.02.00a for the latest changes to this development branch. + # cuDF 21.12.00 (Date TBD) Please see https://github.com/rapidsai/cudf/releases/tag/v21.12.00a for the latest changes to this development branch. diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 803e4f0ba26..60a5959a23f 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -10,7 +10,7 @@ dependencies: - clang=11.0.0 - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - - rmm=21.12.* + - rmm=22.02.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 2281d361ebd..7904593c4c7 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -10,7 +10,7 @@ dependencies: - clang=11.0.0 - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - - rmm=21.12.* + - rmm=22.02.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1a0c853ef48..bd08717ff43 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/RAPIDS.cmake +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.02/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake ) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 1141f20e3b1..55e5119040e 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 21.12.00 +PROJECT_NUMBER = 22.02.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2167,7 +2167,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.12 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/22.02 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. 
See section "Linking to diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index df44ac31d90..9bb021f1429 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -14,7 +14,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-21.12) +set(CUDF_TAG branch-22.02) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 4a7d115ae3b..2c184252192 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -80,9 +80,9 @@ # built documents. # # The short X.Y version. -version = '21.12' +version = '22.02' # The full version, including alpha/beta/rc tags. -release = '21.12.00' +release = '22.02.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From d64e2749a608d0eca79f5baa01ce5e13afaadc96 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 12 Nov 2021 19:47:46 -0600 Subject: [PATCH 002/202] Fix links in C++ Developer Guide. (#9675) This PR improves the C++ developer guide. My primary goal was to fix some invalid links. The diff is a bit large because of some minor changes in the interest of establishing consistent style and improving the reading/editing experience. (e.g. replacing a few instances of tabs with spaces, trimming trailing whitespace, wrapping sections that were not wrapped like the rest of the file, and correcting typos that I came across while reading). To save time, I recommend that reviewers use the option in GitHub's review tab that will ignore whitespace changes. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9675 --- cpp/docs/BENCHMARKING.md | 22 +- cpp/docs/DEVELOPER_GUIDE.md | 580 ++++++++++++++++++------------------ cpp/docs/TESTING.md | 160 +++++----- 3 files changed, 384 insertions(+), 378 deletions(-) diff --git a/cpp/docs/BENCHMARKING.md b/cpp/docs/BENCHMARKING.md index ddf7e177516..ed99ff5f1be 100644 --- a/cpp/docs/BENCHMARKING.md +++ b/cpp/docs/BENCHMARKING.md @@ -8,16 +8,16 @@ other benchmarks in `cpp/benchmarks` to understand the options. ## Directory and File Naming -The naming of unit benchmark directories and source files should be consistent with the feature -being benchmarked. For example, the benchmarks for APIs in `copying.hpp` should live in -`cudf/cpp/benchmarks/copying`. Each feature (or set of related features) should have its own +The naming of unit benchmark directories and source files should be consistent with the feature +being benchmarked. For example, the benchmarks for APIs in `copying.hpp` should live in +`cudf/cpp/benchmarks/copying`. Each feature (or set of related features) should have its own benchmark source file named `_benchmark.cu/cpp`. For example, -`cudf/cpp/src/copying/scatter.cu` has benchmarks in +`cudf/cpp/src/copying/scatter.cu` has benchmarks in `cudf/cpp/benchmarks/copying/scatter_benchmark.cu`. -In the interest of improving compile time, whenever possible, test source files should be `.cpp` +In the interest of improving compile time, whenever possible, test source files should be `.cpp` files because `nvcc` is slower than `gcc` in compiling host code. Note that `thrust::device_vector` -includes device code, and so must only be used in `.cu` files. `rmm::device_uvector`, +includes device code, and so must only be used in `.cu` files. 
`rmm::device_uvector`, `rmm::device_buffer` and the various `column_wrapper` types described in [Testing](TESTING.md) can be used in `.cpp` files, and are therefore preferred in test code over `thrust::device_vector`. @@ -25,7 +25,7 @@ can be used in `.cpp` files, and are therefore preferred in test code over `thru CUDA computations and operations like copies are typically asynchronous with respect to host code, so it is important to carefully synchronize in order to ensure the benchmark timing is not stopped -before the feature you are benchmarking has completed. An RAII helper class `cuda_event_timer` is +before the feature you are benchmarking has completed. An RAII helper class `cuda_event_timer` is provided in `cpp/benchmarks/synchronization/synchronization.hpp` to help with this. This class can also optionally clear the GPU L2 cache in order to ensure cache hits do not artificially inflate performance in repeated iterations. @@ -35,10 +35,10 @@ performance in repeated iterations. In general, we should benchmark all features over a range of data sizes and types, so that we can catch regressions across libcudf changes. However, running many benchmarks is expensive, so ideally we should sample the parameter space in such a way to get good coverage without having to test -exhaustively. +exhaustively. -A rule of thumb is that we should benchmark with enough data to reach the point where the algorithm -reaches its saturation bottleneck, whether that bottleneck is bandwidth or computation. Using data +A rule of thumb is that we should benchmark with enough data to reach the point where the algorithm +reaches its saturation bottleneck, whether that bottleneck is bandwidth or computation. Using data sets larger than this point is generally not helpful, except in specific cases where doing so -exercises different code and can therefore uncover regressions that smaller benchmarks will not +exercises different code and can therefore uncover regressions that smaller benchmarks will not (this should be rare). diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 18860504bf1..5e465ed6991 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -1,31 +1,31 @@ # libcudf C++ Developer Guide -This document serves as a guide for contributors to libcudf C++ code. Developers should also refer +This document serves as a guide for contributors to libcudf C++ code. Developers should also refer to these additional files for further documentation of libcudf best practices. * [Documentation Guide](DOCUMENTATION.md) for guidelines on documenting libcudf code. * [Testing Guide](TESTING.md) for guidelines on writing unit tests. -* [Benchmarking Guide](TODO) for guidelines on writing unit benchmarks. +* [Benchmarking Guide](BENCHMARKING.md) for guidelines on writing unit benchmarks. # Overview -libcudf is a C++ library that provides GPU-accelerated data-parallel algorithms for processing -column-oriented tabular data. libcudf provides algorithms including slicing, filtering, sorting, +libcudf is a C++ library that provides GPU-accelerated data-parallel algorithms for processing +column-oriented tabular data. libcudf provides algorithms including slicing, filtering, sorting, various types of aggregations, and database-type operations such as grouping and joins. libcudf serves a number of clients via multiple language interfaces, including Python and Java. Users may also use libcudf directly from C++ code. 
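To give a feel for what this looks like in practice, here is a minimal hedged sketch of calling libcudf from C++. The wrapper function is hypothetical; `cudf::sort` and the view/ownership types it uses are described later in this guide.

```c++
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>

// `input` is a non-owning view of columns that live elsewhere; the result is
// a new owning table whose device memory is allocated by the call.
std::unique_ptr<cudf::table> sorted_copy(cudf::table_view const& input)
{
  return cudf::sort(input);
}
```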
## Lexicon -This section defines terminology used within libcudf +This section defines terminology used within libcudf. ### Column -A column is an array of data of a single type. Along with Tables, columns are the fundamental data +A column is an array of data of a single type. Along with Tables, columns are the fundamental data structures used in libcudf. Most libcudf algorithms operate on columns. Columns may have a validity -mask representing whether each element is valid or null (invalid). Columns of nested types are +mask representing whether each element is valid or null (invalid). Columns of nested types are supported, meaning that a column may have child columns. A column is the C++ equivalent to a cuDF -Python [series](https://docs.rapids.ai/api/cudf/stable/api.html#series) +Python [Series](https://docs.rapids.ai/api/cudf/stable/api_docs/series.html). ### Element @@ -37,29 +37,29 @@ A type representing a single element of a data type. ### Table -A table is a collection of columns with equal number of elements. A table is the C++ equivalent to -a cuDF Python [data frame](https://docs.rapids.ai/api/cudf/stable/api.html#dataframe). +A table is a collection of columns with equal number of elements. A table is the C++ equivalent to +a cuDF Python [DataFrame](https://docs.rapids.ai/api/cudf/stable/api_docs/dataframe.html). ### View -A view is a non-owning object that provides zero-copy access (possibly with slicing or offsets) data -owned by another object. Examples are column views and table views. +A view is a non-owning object that provides zero-copy access (possibly with slicing or offsets) to +data owned by another object. Examples are column views and table views. # Directory Structure and File Naming -External/public libcudf APIs are grouped based on functionality into an appropriately titled -header file in `cudf/cpp/include/cudf/`. For example, `cudf/cpp/include/cudf/copying.hpp` -contains the APIs for functions related to copying from one column to another. Note the `.hpp` +External/public libcudf APIs are grouped based on functionality into an appropriately titled +header file in `cudf/cpp/include/cudf/`. For example, `cudf/cpp/include/cudf/copying.hpp` +contains the APIs for functions related to copying from one column to another. Note the `.hpp` file extension used to indicate a C++ header file. -Header files should use the `#pragma once` include guard. +Header files should use the `#pragma once` include guard. -The naming of external API headers should be consistent with the name of the folder that contains +The naming of external API headers should be consistent with the name of the folder that contains the source files that implement the API. For example, the implementation of the APIs found in -`cudf/cpp/include/cudf/copying.hpp` are located in `cudf/src/copying`. Likewise, the unit tests for +`cudf/cpp/include/cudf/copying.hpp` are located in `cudf/src/copying`. Likewise, the unit tests for the APIs reside in `cudf/tests/copying/`. -Internal API headers containing `detail` namespace definitions that are used across translation +Internal API headers containing `detail` namespace definitions that are used across translation units inside libcudf should be placed in `include/cudf/detail`. ## File extensions @@ -75,22 +75,24 @@ execution policy (always `rmm::exec_policy` in libcudf). 
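As an illustration of the naming and layout conventions above, a hypothetical new feature called `rotate` might be organized as in the following sketch (the feature name and paths are invented for this example):

```c++
// cpp/include/cudf/rotate.hpp -- public header named after the feature
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>

#include <memory>

namespace cudf {

// Public API; the implementation would live in cpp/src/rotate/rotate.cu and
// the unit tests in cpp/tests/rotate/.
std::unique_ptr<column> rotate(column_view const& input);

}  // namespace cudf
```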
## Code and Documentation Style and Formatting -libcudf code uses [snake_case](https://en.wikipedia.org/wiki/Snake_case) for all names except in a -few cases: template parameters, unit tests and test case names may use Pascal case, aka -[UpperCamelCase](https://en.wikipedia.org/wiki/Camel_case). We do not use [Hungarian notation](https://en.wikipedia.org/wiki/Hungarian_notation), except sometimes when naming device data variables and their corresponding -host copies. Private member variables are typically prefixed with an underscore. +libcudf code uses [snake_case](https://en.wikipedia.org/wiki/Snake_case) for all names except in a +few cases: template parameters, unit tests and test case names may use Pascal case, aka +[UpperCamelCase](https://en.wikipedia.org/wiki/Camel_case). We do not use +[Hungarian notation](https://en.wikipedia.org/wiki/Hungarian_notation), except sometimes when naming +device data variables and their corresponding host copies. Private member variables are typically +prefixed with an underscore. ```c++ template <typename T> -void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_memory_resource* mr) +void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_memory_resource* mr) { ... } -class utility_class +class utility_class { ... - private: +private: int _rating{}; std::unique_ptr<cudf::column> _column{}; }; @@ -103,26 +105,26 @@ TYPED_TEST(RepeatTypedTestFixture, RepeatScalarCount) } ``` -C++ formatting is enforced using `clang-format`. You should configure `clang-format` on your -machine to use the `cudf/cpp/.clang-format` configuration file, and run `clang-format` on all -changed code before committing it. The easiest way to do this is to configure your editor to -"format on save". +C++ formatting is enforced using `clang-format`. You should configure `clang-format` on your +machine to use the `cudf/cpp/.clang-format` configuration file, and run `clang-format` on all +changed code before committing it. The easiest way to do this is to configure your editor to +"format on save." Aspects of code style not discussed in this document and not automatically enforceable are typically caught during code review, or not enforced. ### C++ Guidelines -In general, we recommend following -[C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines). We also -recommend watching Sean Parent's [C++ Seasoning talk](https://www.youtube.com/watch?v=W2tWOdzgXHA), -and we try to follow his rules: "No raw loops. No raw pointers. No raw synchronization primitives." +In general, we recommend following +[C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines). We also +recommend watching Sean Parent's [C++ Seasoning talk](https://www.youtube.com/watch?v=W2tWOdzgXHA), +and we try to follow his rules: "No raw loops. No raw pointers. No raw synchronization primitives." * Prefer algorithms from STL and Thrust to raw loops. * Prefer libcudf and RMM [owning data structures and views](#libcudf-data-structures) to raw pointers and raw memory allocation. * libcudf doesn't have a lot of CPU-thread concurrency, but there is some. And currently libcudf does use raw synchronization primitives. So we should revisit Parent's third rule and improve here. Documentation is discussed in the [Documentation Guide](DOCUMENTATION.md).
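To illustrate the "no raw loops" rule, here is a hedged sketch of replacing a hand-rolled device loop with a Thrust algorithm. The function is hypothetical, and compiling the device lambda assumes nvcc's extended-lambda support.

```c++
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/transform.h>

// Doubles every element of `v` without writing a raw kernel or loop.
void double_all(rmm::device_uvector<int>& v, rmm::cuda_stream_view stream)
{
  thrust::transform(rmm::exec_policy(stream), v.begin(), v.end(), v.begin(),
                    [] __device__(int x) { return 2 * x; });
}
```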
@@ -131,28 +133,28 @@ Documentation is discussed in the [Documentation Guide](DOCUMENTATION.md). ## Includes The following guidelines apply to organizing `#include` lines. - * Group includes by library (e.g. cuDF, RMM, Thrust, STL). `clang-format` will respect the + * Group includes by library (e.g. cuDF, RMM, Thrust, STL). `clang-format` will respect the groupings and sort the individual includes within a group lexicographically. * Separate groups by a blank line. - * Order the groups from "nearest" to "farthest". In other words, local includes, then includes - from other RAPIDS libraries, then includes from related libraries, like `<thrust/copy.h>`, then - includes from dependencies installed with cuDF, and then standard headers (for example `<string>`, + * Order the groups from "nearest" to "farthest". In other words, local includes, then includes + from other RAPIDS libraries, then includes from related libraries, like `<thrust/copy.h>`, then + includes from dependencies installed with cuDF, and then standard headers (for example `<string>`, `<iostream>`). - * Use <> instead of "" unless the header is in the same directory as the source file. + * Use `<>` instead of `""` unless the header is in the same directory as the source file. * Tools like `clangd` often auto-insert includes when they can, but they usually get the grouping and brackets wrong. - * Always check that includes are only necessary for the file in which they are included. - Try to avoid excessive including especially in header files. Double check this when you remove + * Always check that includes are only necessary for the file in which they are included. + Try to avoid excessive including especially in header files. Double check this when you remove code. * Use quotes `"` to include local headers from the same relative source directory. This should only - occur in source files and non-public header files. Otherwise use angle brackets `<>` around + occur in source files and non-public header files. Otherwise use angle brackets `<>` around included header filenames. * Avoid relative paths with `..` when possible. Paths with `..` are necessary when including - (internal) headers from source paths not in the same directory as the including file, + (internal) headers from source paths not in the same directory as the including file, because source paths are not passed with `-I`. * Avoid including library internal headers from non-internal files. For example, try not to include - headers from libcudf `src` directories in tests or in libcudf public headers. If you find - yourself doing this, start a discussion about moving (parts of) the included internal header + headers from libcudf `src` directories in tests or in libcudf public headers. If you find + yourself doing this, start a discussion about moving (parts of) the included internal header to a public header. # libcudf Data Structures @@ -162,14 +164,14 @@ data structures you will use when developing libcudf code. ## Views and Ownership -Resource ownership is an essential concept in libcudf. In short, an "owning" object owns a -resource (such as device memory). It acquires that resource during construction and releases the +Resource ownership is an essential concept in libcudf. In short, an "owning" object owns a +resource (such as device memory). It acquires that resource during construction and releases the resource in destruction ([RAII](https://en.cppreference.com/w/cpp/language/raii)). A "non-owning" object does not own resources.
Any class in libcudf with the `*_view` suffix is non-owning. For more detail see the [`libcudf++` presentation.](https://docs.google.com/presentation/d/1zKzAtc1AWFKfMhiUlV5yRZxSiPLwsObxMlWRWz_f5hA/edit?usp=sharing) libcudf functions typically take views as input (`column_view`, `table_view`, or `scalar_view`) and produce `unique_ptr`s to owning objects as output. For example, ```c++ std::unique_ptr<table>
sort(table_view const& input); ``` ## `rmm::device_memory_resource` -libcudf Allocates all device memory via RMM memory resources (MR). See the +libcudf allocates all device memory via RMM memory resources (MR). See the [RMM documentation](https://github.com/rapidsai/rmm/blob/main/README.md) for details. ### Current Device Memory Resource RMM provides a "default" memory resource for each device that can be accessed and updated via the -`rmm::mr::get_current_device_resource()` and `rmm::mr::set_current_device_resource(...)` functions, -respectively. All memory resource parameters should be defaulted to use the return value of -`rmm::mr::get_current_device_resource()`. +`rmm::mr::get_current_device_resource()` and `rmm::mr::set_current_device_resource(...)` functions, +respectively. All memory resource parameters should be defaulted to use the return value of +`rmm::mr::get_current_device_resource()`. ## `cudf::column` -`cudf::column` is a core owning data structure in libcudf. Most libcudf public APIs produce either -a `cudf::column` or a `cudf::table` as output. A `column` contains `device_buffer`s which own the -device memory for the elements of a column and an optional null indicator bitmask. +`cudf::column` is a core owning data structure in libcudf. Most libcudf public APIs produce either +a `cudf::column` or a `cudf::table` as output. A `column` contains `device_buffer`s which own the +device memory for the elements of a column and an optional null indicator bitmask. -Implicitly convertible to `column_view` and `mutable_column_view`. +Implicitly convertible to `column_view` and `mutable_column_view`. -Movable and copyable. A copy performs a deep copy of the column's contents, whereas a move moves +Movable and copyable. A copy performs a deep copy of the column's contents, whereas a move moves the contents from one column to another. Example: @@ -214,13 +216,13 @@ A `column` may have nested (child) columns, depending on the data type of the co ### `cudf::column_view` -`cudf::column_view` is a core non-owning data structure in libcudf. It is an immutable, +`cudf::column_view` is a core non-owning data structure in libcudf. It is an immutable, non-owning view of device memory as a column. Most libcudf public APIs take views as inputs. -A `column_view` may be a view of a "slice" of a column. For example, it might view rows 75-150 of a -column with 1000 rows. The `size()` of this `column_view` would be `75`, and accessing index `0` of -the view would return the element at index `75` of the owning `column`. Internally, this is -implemented by storing in the view a pointer, an offset, and a size. `column_view::data<T>()` +A `column_view` may be a view of a "slice" of a column. For example, it might view rows 75-150 of a +column with 1000 rows. The `size()` of this `column_view` would be `75`, and accessing index `0` of +the view would return the element at index `75` of the owning `column`. Internally, this is +implemented by storing in the view a pointer, an offset, and a size. `column_view::data<T>()` returns a pointer iterator to `column_view::head<T>() + offset`. ### `cudf::mutable_column_view` A *mutable*, non-owning view of a column. Used in detail APIs that modify columns in place. ### `cudf::column_device_view` -An immutable, non-owning view of device data as a column of elements that is trivially copyable and -usable in CUDA device code.
Used to pass `column_view` data as input to CUDA kernels and device +An immutable, non-owning view of device data as a column of elements that is trivially copyable and +usable in CUDA device code. Used to pass `column_view` data as input to CUDA kernels and device functions (including Thrust algorithms) ### `cudf::mutable_column_device_view` -A mutable, non-owning view of device data as a column of elements that is trivially copyable and +A mutable, non-owning view of device data as a column of elements that is trivially copyable and usable in CUDA device code. Used to pass `column_view` data to be modified on the device by CUDA kernels and device functions (including Thrust algorithms). ## `cudf::table` -Owning class for a set of `cudf::column`s all with equal number of elements. This is the C++ -equivalent to a data frame. +Owning class for a set of `cudf::column`s all with equal number of elements. This is the C++ +equivalent to a data frame. Implicitly convertible to `cudf::table_view` and `cudf::mutable_table_view`. -Movable and copyable. A copy performs a deep copy of all columns, whereas a move moves all columns +Movable and copyable. A copy performs a deep copy of all columns, whereas a move moves all columns from one table to another. ### `cudf::table_view` -An *immutable*, non-owning view of a table. +An *immutable*, non-owning view of a table. ### `cudf::mutable_table_view` A *mutable*, non-owning view of a table. @@ -261,20 +263,20 @@ A *mutable*, non-owning view of a table. ## Spans libcudf provides `span` classes that mimic C++20 `std::span`, which is a lightweight -view of a contiguous sequence of objects. libcudf provides two classes, `host_span` and -`device_span`, which can be constructed from multiple container types, or from a pointer -(host or device, respectively) and size, or from iterators. `span` types are useful for defining +view of a contiguous sequence of objects. libcudf provides two classes, `host_span` and +`device_span`, which can be constructed from multiple container types, or from a pointer +(host or device, respectively) and size, or from iterators. `span` types are useful for defining generic (internal) interfaces which work with multiple input container types. `device_span` can be -constructed from `thrust::device_vector`, `rmm::device_vector`, or `rmm::device_uvector`. +constructed from `thrust::device_vector`, `rmm::device_vector`, or `rmm::device_uvector`. `host_span` can be constructed from `thrust::host_vector`, `std::vector`, or `std::basic_string`. -If you are definining internal (detail) functions that operate on vectors, use spans for the input +If you are defining internal (detail) functions that operate on vectors, use spans for the input vector parameters rather than a specific vector type, to make your functions more widely applicable. When a `span` refers to immutable elements, use `span<T const>`, not `span<T> const`. Since a span is a lightweight view, it does not propagate `const`-ness. Therefore, `const` should be applied to -the template type parameter, not to the `span` itself. Also, `span` should be passed by value -because it is a lightweight view. APIs in libcudf that take spans as input will look like the +the template type parameter, not to the `span` itself. Also, `span` should be passed by value +because it is a lightweight view. APIs in libcudf that take spans as input will look like the following function that copies device data to a host `std::vector`.
```c++ std::vector<T> make_std_vector_async(device_span<T const> v, rmm::cuda_stream_view stream) ``` ## `cudf::scalar` -A `cudf::scalar` is an object that can represent a singular, nullable value of any of the types -currently supported by cudf. Each type of value is represented by a separate type of scalar class -which are all derived from `cudf::scalar`. e.g. A `numeric_scalar` holds a single numerical value, +A `cudf::scalar` is an object that can represent a singular, nullable value of any of the types +currently supported by cudf. Each type of value is represented by a separate type of scalar class +which are all derived from `cudf::scalar`. e.g. A `numeric_scalar` holds a single numerical value, a `string_scalar` holds a single string. The data for the stored value resides in device memory. -A `list_scalar` holds the underlying data of a single list. This means the underlying data can be any type -that cudf supports. For example, a `list_scalar` representing a list of integers stores a `cudf::column` -of type `INT32`. A `list_scalar` representing a list of lists of integers stores a `cudf::column` of -type `LIST`, which in turn stores a column of type `INT32`. +A `list_scalar` holds the underlying data of a single list. This means the underlying data can be +any type that cudf supports. For example, a `list_scalar` representing a list of integers stores a +`cudf::column` of type `INT32`. A `list_scalar` representing a list of lists of integers stores a +`cudf::column` of type `LIST`, which in turn stores a column of type `INT32`.

|Value type|Scalar class|Notes|
|-|-|-|
|fixed-width|`fixed_width_scalar<T>`| `T` can be any fixed-width type|
|numeric|`numeric_scalar<T>`| `T` can be any numeric type |
|fixed-point|`fixed_point_scalar<T>`| `T` can be `numeric::decimal32` or `numeric::decimal64` |
|timestamp|`timestamp_scalar<T>`| `T` can be any timestamp type |
|duration|`duration_scalar<T>`| `T` can be any duration type |
|string|`string_scalar`| This class object contains device string data |
|list|`list_scalar`| Underlying data can be any type supported by cudf |

### Construction -`scalar`s can be created using either their respective constructors or using factory functions like -`make_numeric_scalar()`, `make_timestamp_scalar()` or `make_string_scalar()`. +`scalar`s can be created using either their respective constructors or using factory functions like +`make_numeric_scalar()`, `make_timestamp_scalar()` or `make_string_scalar()`. ### Casting -All the factory methods return a `unique_ptr<scalar>` which needs to be statically downcasted to -its respective scalar class type before accessing its value. Their validity (nullness) can be -accessed without casting. Generally, the value needs to be accessed from a function that is aware -of the value type e.g. a functor that is dispatched from `type_dispatcher`. To cast to the -requisite scalar class type given the value type, use the mapping utility `scalar_type_t` provided -in `type_dispatcher.hpp`: +All the factory methods return a `unique_ptr<scalar>` which needs to be statically downcasted to +its respective scalar class type before accessing its value. Their validity (nullness) can be +accessed without casting. Generally, the value needs to be accessed from a function that is aware +of the value type e.g. a functor that is dispatched from `type_dispatcher`. To cast to the +requisite scalar class type given the value type, use the mapping utility `scalar_type_t` provided +in `type_dispatcher.hpp`: ```c++ //unique_ptr<scalar> s = make_numeric_scalar(...); using ScalarType = cudf::scalar_type_t<int32_t>; //== numeric_scalar<int32_t> auto s1 = static_cast<ScalarType *>(s.get()); ``` ### Passing to device Each scalar type, except `list_scalar`, has a corresponding non-owning device view class which allows -access to the value and its validity from the device. This can be obtained using the function -`get_scalar_device_view(ScalarType s)`.
Note that a device view is not provided for a base scalar +access to the value and its validity from the device. This can be obtained using the function +`get_scalar_device_view(ScalarType s)`. Note that a device view is not provided for a base scalar object, only for the derived typed scalar class objects. The underlying data for `list_scalar` can be accessed via `view()` method. For non-nested data, @@ -339,17 +341,17 @@ data, a specialized device view for list columns can be constructed via ## Streams -CUDA streams are not yet exposed in external libcudf APIs. However, in order to ease the transition -to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be +CUDA streams are not yet exposed in external libcudf APIs. However, in order to ease the transition +to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be implemented using asynchronous APIs on the default stream (e.g., stream 0). -The recommended pattern for doing this is to make the definition of the external API invoke an -internal API in the `detail` namespace. The internal `detail` API has the same parameters as the -public API, plus a `rmm::cuda_stream_view` parameter at the end defaulted to -`rmm::cuda_stream_default`. The implementation should be wholly contained in the `detail` API +The recommended pattern for doing this is to make the definition of the external API invoke an +internal API in the `detail` namespace. The internal `detail` API has the same parameters as the +public API, plus a `rmm::cuda_stream_view` parameter at the end defaulted to +`rmm::cuda_stream_default`. The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the stream parameter. -In order to make the `detail` API callable from other libcudf functions, it should be exposed in a +In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory. For example: @@ -382,55 +384,57 @@ void external_function(...){ ``` **Note:** It is important to synchronize the stream if *and only if* it is necessary. For example, -when a non-pointer value is returned from the API that is the result of an asynchronous +when a non-pointer value is returned from the API that is the result of an asynchronous device-to-host copy, the stream used for the copy should be synchronized before returning. However, -when a column is returned, the stream should not be synchronized because doing so will break +when a column is returned, the stream should not be synchronized because doing so will break asynchrony if and when we add an asynchronous API to libcudf. **Note:** `cudaDeviceSynchronize()` should *never* be used. - This limits the ability to do any multi-stream/multi-threaded work with libcudf APIs. +This limits the ability to do any multi-stream/multi-threaded work with libcudf APIs. ### NVTX Ranges - In order to aid in performance optimization and debugging, all compute intensive libcudf functions should have a corresponding NVTX range. - In libcudf, we have a convenience macro `CUDF_FUNC_RANGE()` that will automatically annotate the lifetime of the enclosing function and use the functions name as the name of the NVTX range. - For more information about NVTX, see [here](https://github.com/NVIDIA/NVTX/tree/dev/cpp). 
+In order to aid in performance optimization and debugging, all compute intensive libcudf functions +should have a corresponding NVTX range. In libcudf, we have a convenience macro `CUDF_FUNC_RANGE()` +that will automatically annotate the lifetime of the enclosing function and use the function's name +as the name of the NVTX range. For more information about NVTX, see +[here](https://github.com/NVIDIA/NVTX/tree/dev/cpp). ### Stream Creation There may be times in implementing libcudf features where it would be advantageous to use streams *internally*, i.e., to accomplish overlap in implementing an algorithm. However, dynamically creating a stream can be expensive. RMM has a stream pool class to help avoid dynamic stream creation. However, this is not yet exposed in libcudf, so for the time being, libcudf features should avoid creating streams (even if it is slightly less efficient). It is a good idea to leave a `// TODO:` note indicating where using a stream would be beneficial. ## Memory Allocation Device [memory resources](#memory_resource) are used in libcudf to abstract and control how device memory is allocated. ### Output Memory Any libcudf API that allocates memory that is *returned* to a user must accept a pointer to a `device_memory_resource` as the last parameter. Inside the API, this memory resource must be used to allocate any memory for returned objects. It should therefore be passed into functions whose outputs will be returned. Example: ```c++ // Returned `column` contains newly allocated memory, // therefore the API must accept a memory resource pointer std::unique_ptr<column> returns_output_memory( ..., rmm::device_memory_resource * mr = rmm::mr::get_current_device_resource()); // This API does not allocate any new *output* memory, therefore // a memory resource is unnecessary void does_not_allocate_output_memory(...); ``` ### Temporary Memory Not all memory allocated within a libcudf API is returned to the caller. Often algorithms must allocate temporary, scratch memory for intermediate results. Always use the default resource obtained from `rmm::mr::get_current_device_resource()` for temporary memory allocations. Example: @@ -451,70 +455,70 @@ libcudf code generally eschews raw pointers and direct memory allocation. Use RMM classes built to use `device_memory_resource`(*)s for device memory allocation with automated lifetime management. #### `rmm::device_buffer` Allocates a specified number of bytes of untyped, uninitialized device memory using a -`device_memory_resource`.
If no resource is explicitly provided, uses +`rmm::mr::get_current_device_resource()`. -`rmm::device_buffer` is movable and copyable on a stream. A copy performs a deep copy of the -`device_buffer`'s device memory on the specified stream, whereas a move moves ownership of the +`rmm::device_buffer` is movable and copyable on a stream. A copy performs a deep copy of the +`device_buffer`'s device memory on the specified stream, whereas a move moves ownership of the device memory from one `device_buffer` to another. ```c++ -// Allocates at least 100 bytes of uninitialized device memory +// Allocates at least 100 bytes of uninitialized device memory // using the specified resource and stream -rmm::device_buffer buff(100, stream, mr); +rmm::device_buffer buff(100, stream, mr); void * raw_data = buff.data(); // Raw pointer to underlying device memory // Deep copies `buff` into `copy` on `stream` -rmm::device_buffer copy(buff, stream); +rmm::device_buffer copy(buff, stream); // Moves contents of `buff` into `moved_to` -rmm::device_buffer moved_to(std::move(buff)); +rmm::device_buffer moved_to(std::move(buff)); custom_memory_resource *mr...; // Allocates 100 bytes from the custom_memory_resource -rmm::device_buffer custom_buff(100, mr, stream); +rmm::device_buffer custom_buff(100, mr, stream); ``` #### `rmm::device_scalar` Allocates a single element of the specified type initialized to the specified value. Use this for -scalar input/outputs into device kernels, e.g., reduction results, null count, etc. This is +scalar input/outputs into device kernels, e.g., reduction results, null count, etc. This is effectively a convenience wrapper around a `rmm::device_vector` of length 1. ```c++ // Allocates device memory for a single int using the specified resource and stream // and initializes the value to 42 -rmm::device_scalar<int32_t> int_scalar{42, stream, mr}; +rmm::device_scalar<int32_t> int_scalar{42, stream, mr}; // scalar.data() returns pointer to value in device memory kernel<<<...>>>(int_scalar.data(),...); -// scalar.value() synchronizes the scalar's stream and copies the +// scalar.value() synchronizes the scalar's stream and copies the // value from device to host and returns the value int host_value = int_scalar.value(); ``` #### `rmm::device_vector` Allocates a specified number of elements of the specified type. If no initialization value is provided, all elements are default initialized (this incurs a kernel launch). **Note**: We have removed all usage of `rmm::device_vector` and `thrust::device_vector` from -libcudf, and you should not use it in new code in libcudf without careful consideration. Instead, -use `rmm::device_uvector` along with the utility factories in `device_factories.hpp`. These +libcudf, and you should not use it in new code in libcudf without careful consideration. Instead, -use `rmm::device_uvector` along with the utility factories in `device_factories.hpp`. These utilities enable creation of `uvector`s from host-side vectors, or creating zero-initialized `uvector`s, so that they are as convenient to use as `device_vector`. Avoiding `device_vector` has a number of benefits, as described in the following section on `rmm::device_uvector`.
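As a sketch of what such a factory-style helper does under the hood, here is an illustrative (not the actual library) function that copies a host vector into a `device_uvector`, assuming only the CUDA runtime:

```c++
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cuda_runtime_api.h>

#include <vector>

// Illustrative helper: copy a host std::vector into a new, initially
// uninitialized device_uvector on the given stream.
rmm::device_uvector<int32_t> to_device(std::vector<int32_t> const& host,
                                       rmm::cuda_stream_view stream)
{
  rmm::device_uvector<int32_t> result(host.size(), stream);
  cudaMemcpyAsync(result.data(), host.data(), host.size() * sizeof(int32_t),
                  cudaMemcpyHostToDevice, stream.value());
  return result;
}
```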
#### `rmm::device_uvector` Similar to a `device_vector`, allocates a contiguous set of elements in device memory but with key differences: - As an optimization, elements are uninitialized and no synchronization occurs at construction. This limits the types `T` to trivially copyable types. - All operations are stream ordered (i.e., they accept a `cuda_stream_view` specifying the stream on which the operation is performed). This improves safety when using non-default streams. - `device_uvector.hpp` does not include any `__device__` code, unlike `thrust/device_vector.hpp`, which means `device_uvector`s can be used in `.cpp` files, rather than just in `.cu` files. ```c++ cuda_stream s; // Allocates uninitialized storage for 100 `int32_t` elements on stream `s` using the // default resource rmm::device_uvector<int32_t> v(100, s); // Initializes the elements to 0 -thrust::uninitialized_fill(thrust::cuda::par.on(s.value()), v.begin(), v.end(), int32_t{0}); +thrust::uninitialized_fill(thrust::cuda::par.on(s.value()), v.begin(), v.end(), int32_t{0}); rmm::mr::device_memory_resource * mr = new my_custom_resource{...}; // Allocates uninitialized storage for 100 `int32_t` elements on stream `s` using the resource `mr` -rmm::device_uvector<int32_t> v2{100, s, mr}; +rmm::device_uvector<int32_t> v2{100, s, mr}; ``` ## Input/Output Style The preferred style for how inputs are passed in and outputs are returned is the following: -- Inputs - - Columns: - - `column_view const&` - - Tables: - - `table_view const&` +- Inputs + - Columns: + - `column_view const&` + - Tables: + - `table_view const&` - Scalar: - `scalar const&` - Everything else: - Trivial or inexpensively copied types - Pass by value - Non-trivial or expensive to copy types - Pass by `const&` -- In/Outs - - Columns: - - `mutable_column_view&` - - Tables: - - `mutable_table_view&` +- In/Outs + - Columns: + - `mutable_column_view&` + - Tables: + - `mutable_table_view&` - Everything else: - Pass via raw pointer -- Outputs - - Outputs should be *returned*, i.e., no output parameters - - Columns: - - `std::unique_ptr<column>` - - Tables: - - `std::unique_ptr<table>` +- Outputs + - Outputs should be *returned*, i.e., no output parameters + - Columns: + - `std::unique_ptr<column>` + - Tables: + - `std::unique_ptr<table>` - Scalars: - `std::unique_ptr<scalar>` ### Multiple Return Values Sometimes it is necessary for functions to have multiple outputs. There are a few ways this can be done in C++ (including creating a `struct` for the output). One convenient way to do this is using `std::tie` and `std::make_pair`. Note that objects passed to `std::make_pair` will invoke either the copy constructor or the move constructor of the object, and it may be preferable to move non-trivially copyable objects (and required for types with deleted copy constructors, like `std::unique_ptr`). ```c++ std::pair<cudf::table, cudf::table> return_two_tables(void){ cudf::table out0; cudf::table out1; ... // Do stuff with out0, out1 - + // Return a std::pair of the two outputs return std::make_pair(std::move(out0), std::move(out1)); } @@ -587,19 +591,20 @@ cudf::table out1; std::tie(out0, out1) = cudf::return_two_outputs(); ``` Note: `std::tuple` _could_ be used if not for the fact that Cython does not support -`std::tuple`. Therefore, libcudf APIs must use `std::pair`, and are therefore limited to return -only two objects of different types. Multiple objects of the same type may be returned via a +`std::tuple`. Therefore, libcudf APIs must use `std::pair`, and are therefore limited to return +only two objects of different types. Multiple objects of the same type may be returned via a `std::vector`. -Alternatively, with C++17 (supported from cudf v0.20), [structured binding](https://en.cppreference.com/w/cpp/language/structured_binding) +Alternatively, with C++17 (supported from cudf v0.20), +[structured binding](https://en.cppreference.com/w/cpp/language/structured_binding) may be used to disaggregate multiple return values: ```c++ auto [out0, out1] = cudf::return_two_outputs(); ``` -Note that the compiler might not support capturing aliases defined in a structured binding +Note that the compiler might not support capturing aliases defined in a structured binding in a lambda. One may work around this by using a capture with an initializer instead: ```c++ @@ -618,10 +623,10 @@ auto foo = [&out0 = out0] { ## Iterator-based interfaces Increasingly, libcudf is moving toward internal (`detail`) APIs with iterator parameters rather -than explicit `column`/`table`/`scalar` parameters. As with STL, iterators enable generic -algorithms to be applied to arbitrary containers. A good example of this is `cudf::copy_if_else`. -This function takes two inputs, and a Boolean mask. It copies the corresponding element from the +than explicit `column`/`table`/`scalar` parameters. As with STL, iterators enable generic +algorithms to be applied to arbitrary containers. A good example of this is `cudf::copy_if_else`. +This function takes two inputs, and a Boolean mask. It copies the corresponding element from the first or second input depending on whether the mask at that index is `true` or `false`.
Implementing `copy_if_else` for all combinations of `column` and `scalar` parameters is simplified by using iterators in the `detail` API. ```c++ template <typename LeftIter, typename RightIter, typename FilterFn> std::unique_ptr<column> copy_if_else( LeftIter lbegin, LeftIter lend, RightIter rbegin, FilterFn filter, ...); ``` -`LeftIter` and `RightIter` need only implement the necessary interface for an iterator. libcudf -provides a number of iterator types and utilities that are useful with iterator-based APIs from -libcudf as well as Thrust algorithms. Most are defined in `include/detail/iterator.cuh`. +`LeftIter` and `RightIter` need only implement the necessary interface for an iterator. libcudf +provides a number of iterator types and utilities that are useful with iterator-based APIs from +libcudf as well as Thrust algorithms. Most are defined in `include/detail/iterator.cuh`. ### Pair iterator -The pair iterator is used to access elements of nullable columns as a pair containing an element's -value and validity. `cudf::detail::make_pair_iterator` can be used to create a pair iterator from a -`column_device_view` or a `cudf::scalar`. `make_pair_iterator` is not available for +The pair iterator is used to access elements of nullable columns as a pair containing an element's +value and validity. `cudf::detail::make_pair_iterator` can be used to create a pair iterator from a +`column_device_view` or a `cudf::scalar`. `make_pair_iterator` is not available for `mutable_column_device_view`. ### Null-replacement iterator @@ -654,20 +659,20 @@ This iterator replaces the null/validity value for each element with a specified ### Validity iterator -This iterator returns the validity of the underlying element (`true` or `false`). Created using +This iterator returns the validity of the underlying element (`true` or `false`). Created using `cudf::detail::make_validity_iterator`. ### Index-normalizing iterators The proliferation of data types supported by libcudf can result in long compile times. One area where compile time was a problem is in types used to store indices, which can be any integer type. -The "Indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be -used for index types (integers) without requiring a type-specific instance. It can be used for any -iterator interface for reading an array of integer values of type `int8`, `int16`, `int32`, -`int64`, `uint8`, `uint16`, `uint32`, or `uint64`. Reading specific elements always returns a +The "Indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be +used for index types (integers) without requiring a type-specific instance. It can be used for any +iterator interface for reading an array of integer values of type `int8`, `int16`, `int32`, -`int64`, `uint8`, `uint16`, `uint32`, or `uint64`. Reading specific elements always returns a `cudf::size_type` integer. Use the `indexalator_factory` to create an appropriate input iterator from a column_view. Example input iterator usage: ```c++ @@ -699,20 +704,20 @@ namespace cudf{ } // namespace cudf ``` -The top-level `cudf` namespace is sufficient for most of the public API. However, to logically -group a broad set of functions, further namespaces may be used. For example, there are numerous -functions that are specific to columns of Strings. These functions reside in the `cudf::strings::` -namespace. Similarly, functionality used exclusively for unit testing is in the `cudf::test::` -namespace.
+The top-level `cudf` namespace is sufficient for most of the public API. However, to logically +group a broad set of functions, further namespaces may be used. For example, there are numerous +functions that are specific to columns of Strings. These functions reside in the `cudf::strings::` +namespace. Similarly, functionality used exclusively for unit testing is in the `cudf::test::` +namespace. ### Internal -Many functions are not meant for public use, so place them in either the `detail` or an *anonymous* +Many functions are not meant for public use, so place them in either the `detail` or an *anonymous* namespace, depending on the situation. #### `detail` namespace -Functions or objects that will be used across *multiple* translation units (i.e., source files), +Functions or objects that will be used across *multiple* translation units (i.e., source files), should be exposed in an internal header file and placed in the `detail` namespace. Example: ```c++ @@ -726,7 +731,7 @@ void reusable_helper_function(...); #### Anonymous namespace -Functions or objects that will only be used in a *single* translation unit should be defined in an +Functions or objects that will only be used in a *single* translation unit should be defined in an *anonymous* namespace in the source file where it is used. Example: ```c++ @@ -736,12 +741,12 @@ void isolated_helper_function(...); } // anonymous namespace ``` -[**Anonymous namespaces should *never* be used in a header file.**](https://wiki.sei.cmu.edu/confluence/display/cplusplus/DCL59-CPP.+Do+not+define+an+unnamed+namespace+in+a+header+file) +[**Anonymous namespaces should *never* be used in a header file.**](https://wiki.sei.cmu.edu/confluence/display/cplusplus/DCL59-CPP.+Do+not+define+an+unnamed+namespace+in+a+header+file) # Error Handling -libcudf follows conventions (and provides utilities) enforcing compile-time and run-time -conditions and detecting and handling CUDA errors. Communication of errors is always via C++ +libcudf follows conventions (and provides utilities) enforcing compile-time and run-time +conditions and detecting and handling CUDA errors. Communication of errors is always via C++ exceptions. ## Runtime Conditions @@ -753,13 +758,14 @@ Example usage: CUDF_EXPECTS(lhs.type() == rhs.type(), "Column type mismatch"); ``` -The first argument is the conditional expression expected to resolve to `true` under normal -conditions. If the conditional evaluates to `false`, then an error has occurred and an instance of `cudf::logic_error` is thrown. The second argument to `CUDF_EXPECTS` is a short description of the -error that has occurred and is used for the exception's `what()` message. +The first argument is the conditional expression expected to resolve to `true` under normal +conditions. If the conditional evaluates to `false`, then an error has occurred and an instance of +`cudf::logic_error` is thrown. The second argument to `CUDF_EXPECTS` is a short description of the +error that has occurred and is used for the exception's `what()` message. -There are times where a particular code path, if reached, should indicate an error no matter what. -For example, often the `default` case of a `switch` statement represents an invalid alternative. -Use the `CUDF_FAIL` macro for such errors. This is effectively the same as calling +There are times where a particular code path, if reached, should indicate an error no matter what. +For example, often the `default` case of a `switch` statement represents an invalid alternative. 
+Use the `CUDF_FAIL` macro for such errors. This is effectively the same as calling `CUDF_EXPECTS(false, reason)`. Example: ```c++ CUDF_FAIL("This code path should not be reached."); ``` ### CUDA Error Checking -Use the `CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This -macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The -thrown exception includes a description of the CUDA error code in it's `what()` message. +Use the `CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This +macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The +thrown exception includes a description of the CUDA error code in its `what()` message. Example: @@ -786,7 +792,7 @@ Use `static_assert` to enforce compile-time conditions. For example, ```c++ template <typename T> void trivial_types_only(T t){ - static_assert(std::is_trivial<T>::value, "This function requires a trivial type."); + static_assert(std::is_trivial<T>::value, "This function requires a trivial type."); ... } ``` # Data Types Columns may contain data of a number of types (see `enum class type_id` in `include/cudf/types.hpp`) * Lists of any type * Structs of columns of any type Most algorithms must support columns of any data type. This leads to complexity in the code, and is one of the primary challenges a libcudf developer faces. Sometimes we develop new algorithms with gradual support for more data types to make this easier. Typically we start with fixed-width data types such as numeric types and timestamps/durations, adding support for nested types later. Enabling an algorithm differently for different types uses either template specialization or SFINAE, as discussed in [Specializing Type-Dispatched Code Paths](#specializing-type-dispatched-code-paths). # Type Dispatcher libcudf stores data (for columns and scalars) "type erased" in `void*` device memory. This *type-erasure* enables interoperability with other languages and type systems, such as Python and Java. In order to determine the type, libcudf algorithms must use the run-time information stored in the column `type()` to reconstruct the data type `T` by casting the `void*` to the appropriate `T*`. This so-called *type dispatch* is pervasive throughout libcudf. The `type_dispatcher` is a central utility that automates the process of mapping the runtime type information in `data_type` to a concrete C++ type. At a high level, you call the `type_dispatcher` with a `data_type` and a function object (also known as a *functor*) with an `operator()` template.
Based on the value of `data_type::id()`, the +type dispatcher invokes the corresponding instantiation of the `operator()` template. -This simplified example shows how the value of `data_type::id()` determines which instantiation of +This simplified example shows how the value of `data_type::id()` determines which instantiation of the `F::operator()` template is invoked. ```c++ @@ -843,7 +849,7 @@ void type_dispatcher(data_type t, F f){ } ``` -The following example shows a function object called `size_of_functor` that returns the size of the +The following example shows a function object called `size_of_functor` that returns the size of the dispatched type. ```c++ @@ -857,9 +863,9 @@ cudf::type_dispatcher(data_type{type_id::INT32}, size_of_functor{}); // returns cudf::type_dispatcher(data_type{type_id::FLOAT64}, size_of_functor{}); // returns 8 ``` -By default, `type_dispatcher` uses `cudf::type_to_id` to provide the mapping of `cudf::type_id` -to dispatched C++ types. However, this mapping may be customized by explicitly specifying a -user-defined trait for the `IdTypeMap`. For example, to always dispatch `int32_t` for all values of +By default, `type_dispatcher` uses `cudf::type_to_id` to provide the mapping of `cudf::type_id` +to dispatched C++ types. However, this mapping may be customized by explicitly specifying a +user-defined trait for the `IdTypeMap`. For example, to always dispatch `int32_t` for all values of `cudf::type_id`: ```c++ @@ -871,18 +877,18 @@ cudf::type_dispatcher(data_type, f); ## Avoid Multiple Type Dispatch -Avoid multiple type-dispatch if possible. The compiler creates a code path for every type -dispatched, so a second-level type dispatch results in quadratic growth in compilation time and +Avoid multiple type-dispatch if possible. The compiler creates a code path for every type +dispatched, so a second-level type dispatch results in quadratic growth in compilation time and object code size. As a large library with many types and functions, we are constantly working to reduce compilation time and code size. ## Specializing Type-Dispatched Code Paths -It is often necessary to customize the dispatched `operator()` for different types. This can be +It is often necessary to customize the dispatched `operator()` for different types. This can be done in several ways. -The first method is to use explicit, full template specialization. This is useful for specializing -behavior for single types. The following example function object prints `"int32_t"` or `"double"` +The first method is to use explicit, full template specialization. This is useful for specializing +behavior for single types. The following example function object prints `"int32_t"` or `"double"` when invoked with either of those types, or `"unhandled type"` otherwise. ```c++ @@ -900,8 +906,8 @@ template <> void type_printer::operator()() { std::cout << "double\n"; } ``` -The second method is to use [SFINAE](https://en.cppreference.com/w/cpp/language/sfinae) with -`std::enable_if_t`. This is useful to partially specialize for a set of types with a common trait. +The second method is to use [SFINAE](https://en.cppreference.com/w/cpp/language/sfinae) with +`std::enable_if_t`. This is useful to partially specialize for a set of types with a common trait. The following example functor prints `integral` or `floating point` for integral or floating point types, respectively. @@ -909,7 +915,7 @@ types, respectively. 
struct integral_or_floating_point {
  template <typename T,
            std::enable_if_t<not std::is_integral<T>::value and
-                             not std::is_floating_point<T>::value>* = nullptr>
+                             not std::is_floating_point<T>::value>* = nullptr>
  void operator()() { std::cout << "neither integral nor floating point\n"; }
 
  template <typename T,
-            std::enable_if_t<std::is_floating_point<T>::value>* = nullptr>
+            std::enable_if_t<std::is_floating_point<T>::value>* = nullptr>
  void operator()() { std::cout << "floating point\n"; }
 };
 ```
 
 For more info on SFINAE with `std::enable_if`, [see this post](https://eli.thegreenplace.net/2014/sfinae-and-enable_if).
 
-There are a number of traits defined in `include/cudf/utilities/traits.hpp` that are useful for
-partial specialization of dispatched function objects. For example `is_numeric()` can be used to
+There are a number of traits defined in `include/cudf/utilities/traits.hpp` that are useful for
+partial specialization of dispatched function objects. For example `is_numeric()` can be used to
 specialize for any numeric type.
 
 # Variable-Size and Nested Data Types
 
-libcudf supports a number of variable-size and nested data types, including strings, lists, and
-structs.
-
- * `string`: Simply a character string, but a column of strings may have a different-length string
+libcudf supports a number of variable-size and nested data types, including strings, lists, and
+structs.
+
+ * `string`: Simply a character string, but a column of strings may have a different-length string
   in each row.
- * `list`: A list of elements of any type, so a column of lists of integers has rows with a list of
-   integers, possibly of a different length, in each row.
+ * `list`: A list of elements of any type, so a column of lists of integers has rows with a list of
+   integers, possibly of a different length, in each row.
 * `struct`: In a column of structs, each row is a structure comprising one or more fields. These
   fields are stored in structure-of-arrays format, so that the column of structs has a nested
-   column for each field of the structure.
+   column for each field of the structure.
 
-As the heading implies, list and struct columns may be nested arbitrarily. One may create a column
-of lists of structs, where the fields of the struct may be of any type, including strings, lists and
-structs. Thinking about deeply nested data types can be confusing for column-based data, even with
+As the heading implies, list and struct columns may be nested arbitrarily. One may create a column
+of lists of structs, where the fields of the struct may be of any type, including strings, lists and
+structs. Thinking about deeply nested data types can be confusing for column-based data, even with
 experience. Therefore it is important to carefully write algorithms, and to test and document them
 well.
 
@@ -952,13 +958,13 @@ well.
 
 In order to represent variable-width elements, libcudf columns contain a vector of child columns.
 For list columns, the parent column's type is `LIST` and contains no data, but its size represents
 the number of lists in the column, and its null mask represents the validity of each list element.
-The parent has two children.
+The parent has two children.
 
 1. A non-nullable column of `INT32` elements that indicates the offset to the beginning of each
    list in a dense column of elements.
-2. A column containing the actual data and optional null mask for all elements of all the lists
+2. A column containing the actual data and optional null mask for all elements of all the lists
   packed together.
-
+
 With this representation, `data[offsets[i]]` is the first element of list `i`, and the size of list
 `i` is given by `offsets[i+1] - offsets[i]`.
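As a self-contained illustration of that offsets arithmetic, here is a host-side sketch in plain standard C++; the two vectors below stand in for the offsets and data child columns, so this is illustrative rather than actual libcudf code:

```c++
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  // Models the lists column [[1, 2], [], [3, 4, 5]].
  std::vector<int> data{1, 2, 3, 4, 5};  // all list elements packed together
  std::vector<int> offsets{0, 2, 2, 5};  // one more entry than the number of lists

  for (std::size_t i = 0; i + 1 < offsets.size(); ++i) {
    int const size = offsets[i + 1] - offsets[i];  // size of list i
    std::cout << "list " << i << " starts at data[" << offsets[i] << "] and has "
              << size << " elements\n";
  }
}
```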
@@ -967,9 +973,9 @@ of any type. Note also that not only is each list nullable (using the null mask
 each list element may be nullable. So you may have a lists column with null row 3, and also null
 element 2 of row 4.
 
-The underlying data for a lists column is always bundled into a single leaf column at the very
-bottom of the hierarchy (ignoring structs, which conceptually "reset" the root of the hierarchy),
-regardless of the level of nesting. So a `List<List<List<List<int>>>>>` column has a single `int`
+The underlying data for a lists column is always bundled into a single leaf column at the very
+bottom of the hierarchy (ignoring structs, which conceptually "reset" the root of the hierarchy),
+regardless of the level of nesting. So a `List<List<List<List<int>>>>` column has a single `int`
 column at the very bottom. The following is a visual representation of this.
 
 ```
@@ -997,17 +1003,17 @@ This is related to [Arrow's "Variable-Size List" memory layout](https://arrow.ap
 
 ## Strings columns
 
-Strings are represented in much the same way as lists, except that the data child column is always
+Strings are represented in much the same way as lists, except that the data child column is always
 a non-nullable column of `INT8` data. The parent column's type is `STRING` and contains no data,
-but its size represents the number of strings in the column, and its null mask represents the
+but its size represents the number of strings in the column, and its null mask represents the
 validity of each string. To summarize, the strings column children are:
 
-1. A non-nullable column of `INT32` elements that indicates the offset to the beginning of each
+1. A non-nullable column of `INT32` elements that indicates the offset to the beginning of each
    string in a dense column of all characters.
-2. A non-nullable column of `INT8` elements of all the characters across all the strings packed
+2. A non-nullable column of `INT8` elements of all the characters across all the strings packed
   together.
 
-With this representation, `characters[offsets[i]]` is the first character of string `i`, and the
+With this representation, `characters[offsets[i]]` is the first character of string `i`, and the
 size of string `i` is given by `offsets[i+1] - offsets[i]`.
 
 The following image shows an example of this compound column representation of strings.
@@ -1026,10 +1032,10 @@ null mask represents the validity of each struct element.
 
 With this representation, `child[0][10]` is row 10 of the first field of the struct, `child[1][42]`
 is row 42 of the second field of the struct.
 
-Notice that in addition to the struct column's null mask, each struct field column has its own optional null
-mask. A struct field's validity can vary independently from the corresponding struct row. For
-instance, a non-null struct row might have a null field. However, the fields of a null struct row
-are deemed to be null as well. For example, consider a struct column of type
+Notice that in addition to the struct column's null mask, each struct field column has its own
+optional null mask. A struct field's validity can vary independently from the corresponding struct
+row. For instance, a non-null struct row might have a null field. However, the fields of a null
+struct row are deemed to be null as well. For example, consider a struct column of type
 `STRUCT<FLOAT32, INT32>`. If the contents are `[ {1.0, 2}, {4.0, 5}, null, {8.0, null} ]`, the
 struct column's layout is as follows. (Note that null masks should be read from right to left.)
 
@@ -1039,46 +1045,46 @@ struct column's layout is as follows. (Note that null masks should be read from
   null_mask = [1, 1, 0, 1]
   null_count = 1
   children = {
-  {
+  {
     type = FLOAT32
     data = [1.0, 4.0, X, 8.0]
     null_mask = [ 1, 1, 0, 1]
     null_count = 1
-  },
-  {
+  },
+  {
     type = INT32
     data = [2, 5, X, X]
     null_mask = [1, 1, 0, 0]
     null_count = 2
-  }
-  }
+  }
+  }
 }
 ```
 
-The last struct row (index 3) is not null, but has a null value in the INT32 field. Also, row 2 of
-the struct column is null, making its corresponding fields also null. Therefore, bit 2 is unset in
+The last struct row (index 3) is not null, but has a null value in the INT32 field. Also, row 2 of
+the struct column is null, making its corresponding fields also null. Therefore, bit 2 is unset in
 the null masks of both struct fields.
 
 ## Dictionary columns
 
-Dictionaries provide an efficient way to represent low-cardinality data by storing a single copy
-of each value. A dictionary comprises a column of sorted keys and a column containing an index into
-the keys column for each row of the parent column. The keys column may have any libcudf data type,
-such as a numerical type or strings. The indices represent the corresponding positions of each
-element's value in the keys. The indices child column can have any unsigned integer type
+Dictionaries provide an efficient way to represent low-cardinality data by storing a single copy
+of each value. A dictionary comprises a column of sorted keys and a column containing an index into
+the keys column for each row of the parent column. The keys column may have any libcudf data type,
+such as a numerical type or strings. The indices represent the corresponding positions of each
+element's value in the keys. The indices child column can have any unsigned integer type
 (`UINT8`, `UINT16`, `UINT32`, or `UINT64`).
 
 ## Nested column challenges
 
-The first challenge with nested columns is that it is effectively impossible to do any operation
-that modifies the length of any string or list in place. For example, consider trying to append the
+The first challenge with nested columns is that it is effectively impossible to do any operation
+that modifies the length of any string or list in place. For example, consider trying to append the
 character `'a'` to the end of each string. This requires dynamically resizing the characters column
-to allow inserting `'a'` at the end of each string, and then modifying the offsets column to
+to allow inserting `'a'` at the end of each string, and then modifying the offsets column to
 indicate the new size of each element. As a result, every operation that can modify the strings or
 lists in a column must be done out-of-place.
 
 The second challenge is that in an out-of-place operation on a strings column, unlike with fixed-
-width elements, the size of the output cannot be known *a priori*. For example, consider scattering
+width elements, the size of the output cannot be known *a priori*. For example, consider scattering
 into a column of strings:
 
 ```c++
@@ -1090,7 +1096,7 @@ result:         {"this", "red", "a", "green", "of", "blue"}
 ```
 
 In this example, the strings "red", "green", and "blue" will respectively be scattered into
-positions `1`, `3`, and `5` of `destination`. Recall from above that this operation cannot be done
+positions `1`, `3`, and `5` of `destination`. Recall from above that this operation cannot be done
 in place, therefore `result` will be generated by selectively copying strings from `destination`
 and `scatter_values`. Notice that `result`'s child column of characters requires storage for `19`
 characters (4 + 3 + 1 + 5 + 2 + 4 characters for the six strings).
However, there is no way to know ahead of time that `result` will require `19` @@ -1102,9 +1108,9 @@ approach: 2. Allocate sufficient storage for all of the output characters and materialize each output string. In scatter, the first phase consists of using the `scatter_map` to determine whether string `i` in -the output will come from `destination` or from `scatter_values` and use the corresponding size(s) -to materialize the offsets column and determine the size of the output. Then, in the second phase, -sufficient storage is allocated for the output's characters, and then the characters are filled +the output will come from `destination` or from `scatter_values` and use the corresponding size(s) +to materialize the offsets column and determine the size of the output. Then, in the second phase, +sufficient storage is allocated for the output's characters, and then the characters are filled with the corresponding strings from either `destination` or `scatter_values`. ## Nested Type Views @@ -1113,15 +1119,15 @@ libcudf provides view types for nested column types as well as for the data elem ### `cudf::strings_column_view` and `cudf::string_view` -`cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of -any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore +`cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of +any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore `cudf::string_view` is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the -data type for a `cudf::column` of type `INT32`. As it's name implies, this is a read-only object +data type for a `cudf::column` of type `INT32`. As it's name implies, this is a read-only object instance that points to device memory inside the strings column. It's lifespan is the same (or less) as the column it views. Use the `column_device_view::element` method to access an individual row element. Like any other -column, do not call `element()` on a row that is null. +column, do not call `element()` on a row that is null. ```c++ cudf::column_device_view d_strings; @@ -1132,11 +1138,11 @@ column, do not call `element()` on a row that is null. } ``` -A null string is not the same as an empty string. Use the `string_scalar` class if you need an +A null string is not the same as an empty string. Use the `string_scalar` class if you need an instance of a class object to represent a null string. -The `string_view` contains comparison operators `<,>,==,<=,>=` that can be used in many cudf -functions like `sort` without string-specific code. The data for a `string_view` instance is +The `string_view` contains comparison operators `<,>,==,<=,>=` that can be used in many cudf +functions like `sort` without string-specific code. The data for a `string_view` instance is required to be [UTF-8](#UTF-8) and all operators and methods expect this encoding. Unless documented otherwise, position and length parameters are specified in characters and not bytes. The class also includes a `string_view::const_iterator` which can be used to navigate through individual characters @@ -1146,13 +1152,13 @@ within the string. #### UTF-8 -The libcudf strings column only supports UTF-8 encoding for strings data. -[UTF-8](https://en.wikipedia.org/wiki/UTF-8) is a variable-length character encoding wherein each +The libcudf strings column only supports UTF-8 encoding for strings data. 
+[UTF-8](https://en.wikipedia.org/wiki/UTF-8) is a variable-length character encoding wherein each character can be 1-4 bytes. This means the length of a string is not the same as its size in bytes. For this reason, it is recommended to use the `string_view` class to access these characters for most operations. -The `string_view.cuh` header also includes some utility methods for reading and writing +The `string_view.cuh` header also includes some utility methods for reading and writing (`to_char_utf8/from_char_utf8`) individual UTF-8 characters to/from byte arrays. ### `cudf::lists_column_view` and `cudf::lists_view` @@ -1171,7 +1177,7 @@ struct, and therefore `cudf::struct_view` is the data type of a `cudf::column` o # cuIO: file reading and writing -cuIO is a component of libcudf that provides GPU-accelerated reading and writing of data file +cuIO is a component of libcudf that provides GPU-accelerated reading and writing of data file formats commonly used in data analytics, including CSV, Parquet, ORC, Avro, and JSON_Lines. // TODO: add more detail and move to a separate file. diff --git a/cpp/docs/TESTING.md b/cpp/docs/TESTING.md index 1bdf9c208d8..3c1e992c7eb 100644 --- a/cpp/docs/TESTING.md +++ b/cpp/docs/TESTING.md @@ -1,68 +1,68 @@ # Unit Testing in libcudf -Unit tests in libcudf are written using +Unit tests in libcudf are written using [Google Test](https://github.com/google/googletest/blob/master/docs/primer.md). -**Important:** Instead of including `gtest/gtest.h` directly, use +**Important:** Instead of including `gtest/gtest.h` directly, use `#include `. ## Best Practices: What Should We Test? -In general we should test to make sure all code paths are covered. This is not always easy or +In general we should test to make sure all code paths are covered. This is not always easy or possible. But generally this means we test all supported combinations of algorithms and data types, -and all operators supported by algorithms that support multiple operators (e.g. reductions, +and all operators supported by algorithms that support multiple operators (e.g. reductions, groupby). Here are some other guidelines. * In general empty input is not an error in libcudf. Typically empty input results in empty output. Tests should verify this. - * Anything that involves manipulating bitmasks (especially hand-rolled kernels) should have tests + * Anything that involves manipulating bitmasks (especially hand-rolled kernels) should have tests that check varying number of rows, especially around boundaries like the warp size (32). So, test fewer than 32 rows, more than 32 rows, exactly 32 rows, and greater than 64 rows. - * Most algorithms should have one or more tests exercising inputs with a large enough number of - rows to require launching multiple thread blocks, especially when values are ultimately - communicated between blocks (e.g. reductions). This is especially important for custom kernels - but also applies to Thrust and CUB algorithm calls with lambdas / functors. + * Most algorithms should have one or more tests exercising inputs with a large enough number of + rows to require launching multiple thread blocks, especially when values are ultimately + communicated between blocks (e.g. reductions). This is especially important for custom kernels + but also applies to Thrust and CUB algorithm calls with lambdas / functors. * For anything involving strings or lists, test exhaustive combinations of empty strings/lists, - null strings/lists and strings/lists with null elements. 
-
+   null strings/lists and strings/lists with null elements.
+
+ * Strings tests should include a mixture of non-ASCII UTF-8 characters like `é` in test data.
 * Test sliced columns as input (that is, columns that have a nonzero `offset`). This is an easy to
   forget case.
- * Tests that verify various forms of "degenerate" column inputs, for example: empty
-   string columns that have no children (not many paths in cudf can generate these but it
-   does happen); columns with zero size but that somehow have non-null data pointers; and struct
+ * Tests that verify various forms of "degenerate" column inputs, for example: empty
+   string columns that have no children (not many paths in cudf can generate these but it
+   does happen); columns with zero size but that somehow have non-null data pointers; and struct
  columns with no children.
- * Decimal types are not included in the `NumericTypes` type list, but are included in
-   `FixedWidthTypes`, so be careful that tests either include or exclude decimal types as
+ * Decimal types are not included in the `NumericTypes` type list, but are included in
+   `FixedWidthTypes`, so be careful that tests either include or exclude decimal types as
  appropriate.
 
 ## Directory and File Naming
 
-The naming of unit test directories and source files should be consistent with the feature being
+The naming of unit test directories and source files should be consistent with the feature being
 tested. For example, the tests for APIs in `copying.hpp` should live in `cudf/cpp/tests/copying`.
-Each feature (or set of related features) should have its own test source file named
-`<feature>_tests.cu/cpp`. For example, `cudf/cpp/src/copying/scatter.cu` has tests in
+Each feature (or set of related features) should have its own test source file named
+`<feature>_tests.cu/cpp`. For example, `cudf/cpp/src/copying/scatter.cu` has tests in
 `cudf/cpp/tests/copying/scatter_tests.cu`.
 
-In the interest of improving compile time, whenever possible, test source files should be `.cpp`
+In the interest of improving compile time, whenever possible, test source files should be `.cpp`
 files because `nvcc` is slower than `gcc` in compiling host code. Note that `thrust::device_vector`
-includes device code, and so must only be used in `.cu` files. `rmm::device_uvector`,
-`rmm::device_buffer` and the various `column_wrapper` types described later can be used in `.cpp`
+includes device code, and so must only be used in `.cu` files. `rmm::device_uvector`,
+`rmm::device_buffer` and the various `column_wrapper` types described later can be used in `.cpp`
 files, and are therefore preferred in test code over `thrust::device_vector`.
 
 ## Base Fixture
 
 All libcudf unit tests should make use of a GTest ["Test Fixture"](https://github.com/google/googletest/blob/master/docs/primer.md#test-fixtures-using-the-same-data-configuration-for-multiple-tests-same-data-multiple-tests).
-Even if the fixture is empty, it should inherit from the base fixture `cudf::test::BaseFixture`
-found in `include/cudf_test/base_fixture.hpp`. This ensures that RMM is properly initialized and
-finalized. `cudf::test::BaseFixture` already inherits from `::testing::Test` and therefore it is
+Even if the fixture is empty, it should inherit from the base fixture `cudf::test::BaseFixture`
+found in `include/cudf_test/base_fixture.hpp`. This ensures that RMM is properly initialized and
+finalized. `cudf::test::BaseFixture` already inherits from `::testing::Test` and therefore it is
 not necessary for your test fixtures to inherit from it.
Example: @@ -74,7 +74,7 @@ class MyTestFixture : public cudf::test::BaseFixture {...}; In general, libcudf features must work across all of the supported types (there are exceptions e.g. not all binary operations are supported for all types). In order to automate the process of running -the same tests across multiple types, we use GTest's +the same tests across multiple types, we use GTest's [Typed Tests](https://github.com/google/googletest/blob/master/docs/advanced.md#typed-tests). Typed tests allow you to write a test once and run it across a list of types. @@ -92,15 +92,15 @@ TYPED_TEST(TypedTestFixture, FirstTest){ ``` To specify the list of types to use, instead of GTest's `::testing::Types<...>`, libcudf provides `cudf::test::Types<...>` which is a custom, drop-in replacement for `::testing::Types`. -In this example, all tests using the `TypedTestFixture` fixture will run once for each type in the +In this example, all tests using the `TypedTestFixture` fixture will run once for each type in the list defined in `TestTypes` (`int, float, double`). ### Type Lists -The list of types that are used in tests should be consistent across all tests. To ensure -consistency, several sets of common type lists are provided in +The list of types that are used in tests should be consistent across all tests. To ensure +consistency, several sets of common type lists are provided in `include/cudf_test/type_lists.hpp`. For example, `NumericTypes` is a type list of all numeric types, -`FixedWidthTypes` is a list of all fixed-width element types, and `AllTypes` is a list of every +`FixedWidthTypes` is a list of all fixed-width element types, and `AllTypes` is a list of every element type that libcudf supports. ```c++ @@ -110,17 +110,17 @@ element type that libcudf supports. TYPED_TEST_SUITE(TypedTestFixture, cudf::test::NumericTypes); ``` -Whenever possible, use one of the type list provided in `include/utilities/test/type_lists.hpp` +Whenever possible, use one of the type list provided in `include/utilities/test/type_lists.hpp` rather than creating new custom lists. #### Advanced Type Lists -Sometimes it is necessary to generate more advanced type lists than the simple lists of single types -in the `TypeList` example above. libcudf provides a set of meta-programming utilities in +Sometimes it is necessary to generate more advanced type lists than the simple lists of single types +in the `TypeList` example above. libcudf provides a set of meta-programming utilities in `include/cudf_test/type_list_utilities.hpp` for generating and composing more advanced type lists. For example, it may be useful to generate a *nested* type list where each element in the list is two -types. In a nested type list, each element in the list is itself another list. In order to access +types. In a nested type list, each element in the list is itself another list. In order to access the `N`th type within the nested list, use `GetType`. Imagine testing all possible two-type combinations of ``. This could be done manually: @@ -129,7 +129,7 @@ Imagine testing all possible two-type combinations of ``. This could using namespace cudf::test; template TwoTypesFixture : BaseFixture{...}; -using TwoTypesList = Types< Types, Types, +using TwoTypesList = Types< Types, Types, Types, Types >; TYPED_TEST_SUITE(TwoTypesFixture, TwoTypesList); TYPED_TEST(TwoTypesFixture, FirstTest){ @@ -140,49 +140,49 @@ TYPED_TEST(TwoTypesFixture, FirstTest){ } ``` -The above example manually specifies all pairs composed of `int` and `float`. 
`CrossProduct` is a
+utility in `type_list_utilities.hpp` which materializes this cross product automatically.
 
 ```c++
-using TwoTypesList = Types< Types<int, int>, Types<int, float>,
+using TwoTypesList = Types< Types<int, int>, Types<int, float>,
                            Types<float, int>, Types<float, float> >;
 using CrossProductTypeList = CrossProduct< Types<int, float>, Types<int, float> >;
 // TwoTypesList and CrossProductTypeList are identical
 ```
 
 `CrossProduct` can be used with an arbitrary number of type lists to generate nested type lists of
-two or more types. **However**, overuse of `CrossProduct` can dramatically inflate compile time.
-The cross product of two type lists of size `n` and `m` will result in a new list with
-`n*m` nested type lists. This means `n*m` templates will be instantiated; `n` and `m` need not be
+two or more types. **However**, overuse of `CrossProduct` can dramatically inflate compile time.
+The cross product of two type lists of size `n` and `m` will result in a new list with
+`n*m` nested type lists. This means `n*m` templates will be instantiated; `n` and `m` need not be
 large before compile time becomes unreasonable.
 
-There are a number of other utilities in `type_list_utilities.hpp`. For more details, see the
-documentation in that file and their associated tests in
+There are a number of other utilities in `type_list_utilities.hpp`. For more details, see the
+documentation in that file and their associated tests in
 `cudf/cpp/tests/utilities_tests/type_list_tests.cpp`.
 
 ## Utilities
 
 libcudf provides a number of utilities in `include/cudf_test` to make common testing operations more
-convenient. Before creating your own test utilities, look to see if one already exists that does
-what you need. If not, consider adding a new utility to do what you need. However, make sure that
-the utility is generic enough to be useful for other tests and is not overly tailored to your
+convenient. Before creating your own test utilities, look to see if one already exists that does
+what you need. If not, consider adding a new utility to do what you need. However, make sure that
+the utility is generic enough to be useful for other tests and is not overly tailored to your
 specific testing need.
 
 ### Column Wrappers
 
 In order to make generating input columns easier, libcudf provides the `*_column_wrapper` classes in
 `include/cudf_test/column_wrapper.hpp`. These classes wrap a `cudf::column` and provide constructors
-for initializing a `cudf::column` object usable with libcudf APIs. Any `*_column_wrapper` class is
-implicitly convertible to a `column_view` or `mutable_column_view` and therefore may be
+for initializing a `cudf::column` object usable with libcudf APIs. Any `*_column_wrapper` class is
+implicitly convertible to a `column_view` or `mutable_column_view` and therefore may be
 transparently passed to any API expecting a `column_view` or `mutable_column_view` argument.
 
 #### `fixed_width_column_wrapper`
 
 The `fixed_width_column_wrapper` class should be used for constructing and initializing columns of
-any fixed-width element type, e.g., numeric types, timestamp types, Boolean, etc.
+`fixed_width_column_wrapper` provides constructors that accept an iterator range to generate each +element in the column. For nullable columns, an additional iterator can be provided to indicate the +validity of each element. There are also constructors that accept a `std::initializer_list` for the column elements and optionally for the validity of each element. Example: @@ -207,9 +207,9 @@ fixed_width_column_wrapper w{ {1,2,3,4}, {1, 0, 1, 0}}; #### `fixed_point_column_wrapper` The `fixed_point_column_wrapper` class should be used for constructing and initializing columns of -any fixed-point element type (DECIMAL32 or DECIMAL64). `fixed_point_column_wrapper` provides -constructors that accept an iterator range to generate each element in the column. For nullable -columns, an additional iterator can be provided to indicate the validity of each element. +any fixed-point element type (DECIMAL32 or DECIMAL64). `fixed_point_column_wrapper` provides +constructors that accept an iterator range to generate each element in the column. For nullable +columns, an additional iterator can be provided to indicate the validity of each element. Constructors also take the scale of the fixed-point values to create. Example: @@ -226,10 +226,10 @@ fixed_point_column_wrapper w(elements, elements + 5, validity, 2); #### `dictionary_column_wrapper` -The `dictionary_column_wrapper` class should be used to create dictionary columns. -`dictionary_column_wrapper` provides constructors that accept an iterator range to generate each -element in the column. For nullable columns, an additional iterator can be provided to indicate the -validity of each element. There are also constructors that accept a `std::initializer_list` for +The `dictionary_column_wrapper` class should be used to create dictionary columns. +`dictionary_column_wrapper` provides constructors that accept an iterator range to generate each +element in the column. For nullable columns, an additional iterator can be provided to indicate the +validity of each element. There are also constructors that accept a `std::initializer_list` for the column elements and optionally for the validity of each element. Example: @@ -273,30 +273,30 @@ dictionary_column_wrapper d({"", "bb", "", "bb", "", "a", ""}, vali #### `strings_column_wrapper` -The `strings_column_wrapper` class should be used to create columns of strings. It provides -constructors that accept an iterator range to generate each string in the column. For nullable -columns, an additional iterator can be provided to indicate the validity of each string. There are -also constructors that accept a `std::initializer_list` for the column's strings and +The `strings_column_wrapper` class should be used to create columns of strings. It provides +constructors that accept an iterator range to generate each string in the column. For nullable +columns, an additional iterator can be provided to indicate the validity of each string. There are +also constructors that accept a `std::initializer_list` for the column's strings and optionally for the validity of each element. 
Example:
 
 ```c++
-// Creates a non-nullable STRING column with 7 string elements:
+// Creates a non-nullable STRING column with 7 string elements:
 // {"", "this", "is", "a", "column", "of", "strings"}
 std::vector<std::string> strings{"", "this", "is", "a", "column", "of", "strings"};
 strings_column_wrapper s(strings.begin(), strings.end());
 
-// Creates a nullable STRING column with 7 string elements:
+// Creates a nullable STRING column with 7 string elements:
 // {NULL, "this", NULL, "a", NULL, "of", NULL}
 std::vector<std::string> strings{"", "this", "is", "a", "column", "of", "strings"};
 auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;});
 strings_column_wrapper s(strings.begin(), strings.end(), validity);
 
-// Creates a non-nullable STRING column with 7 string elements:
+// Creates a non-nullable STRING column with 7 string elements:
 // {"", "this", "is", "a", "column", "of", "strings"}
 strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"});
 
-// Creates a nullable STRING column with 7 string elements:
+// Creates a nullable STRING column with 7 string elements:
 // {NULL, "this", NULL, "a", NULL, "of", NULL}
 auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;});
 strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}, validity);
 ```
 
 #### `lists_column_wrapper`
 
-The `lists_column_wrapper` class should be used to create columns of lists. It provides
-constructors that accept an iterator range to generate each list in the column. For nullable
-columns, an additional iterator can be provided to indicate the validity of each list. There are
-also constructors that accept a `std::initializer_list` for the column's lists and
+The `lists_column_wrapper` class should be used to create columns of lists. It provides
+constructors that accept an iterator range to generate each list in the column. For nullable
+columns, an additional iterator can be provided to indicate the validity of each list. There are
+also constructors that accept a `std::initializer_list` for the column's lists and
 optionally for the validity of each element. A number of other constructors are available.
 
 Example:
 
 ```c++
@@ -357,9 +357,9 @@ lists_column_wrapper l{ {{{0, 1}, {2, 3}}, validity}, {{{4, 5}, {6, 7}}, validit
 
 #### `structs_column_wrapper`
 
-The `structs_column_wrapper` class should be used to create columns of structs. It provides
+The `structs_column_wrapper` class should be used to create columns of structs. It provides
 constructors that accept a vector or initializer list of pre-constructed columns or column wrappers
-for child columns. For nullable columns, an additional iterator can be provided to indicate the
+for child columns. For nullable columns, an additional iterator can be provided to indicate the
 validity of each struct.
 
 Examples:
 
 ```c++
@@ -413,29 +413,29 @@
 have the same metadata.
 
 #### `expect_column_properties_equal`
 
-Verifies that two columns have the same type, size, and nullability. For nested types, recursively
+Verifies that two columns have the same type, size, and nullability. For nested types, recursively
 verifies the equality of type, size and nullability of all nested children.
 
 #### `expect_column_properties_equivalent`
 
-Verifies that two columns have equivalent type and equal size, ignoring nullability. For nested
+Verifies that two columns have equivalent type and equal size, ignoring nullability.
For nested types, recursively verifies the equivalence of type, and equality of size of all nested children, ignoring nullability. Note "equivalent type". Most types are equivalent if and only they are equal. `fixed_point` types -are one exception. They are equivalent if the representation type is equal, even if they have -different scales. Nested type columns can be equivalent in the case where they both have zero size, -but one has children (also empty) and the other does not. For columns with nonzero size, both equals +are one exception. They are equivalent if the representation type is equal, even if they have +different scales. Nested type columns can be equivalent in the case where they both have zero size, +but one has children (also empty) and the other does not. For columns with nonzero size, both equals and equivalent expect equal number of children. #### `expect_columns_equal` -Verifies that two columns have equal properties and verifies elementwise equality of the column +Verifies that two columns have equal properties and verifies elementwise equality of the column data. Null elements are treated as equal. #### `expect_columns_equivalent` -Verifies that two columns have equivalent properties and verifies elementwise equivalence of the +Verifies that two columns have equivalent properties and verifies elementwise equivalence of the column data. Null elements are treated as equivalent. #### `expect_equal_buffers` @@ -444,6 +444,6 @@ Verifies the bitwise equality of two device memory buffers. ### Printing and accessing column data -`include/cudf_test/column_utilities.hpp` defines various functions and overloads for printing +`include/cudf_test/column_utilities.hpp` defines various functions and overloads for printing columns (`print`), converting column data to string (`to_string`, `to_strings`), and copying data to the host (`to_host). From 7fc65d80893a85c58252742e33cfee5e4dda179d Mon Sep 17 00:00:00 2001 From: Peixin Date: Tue, 16 Nov 2021 10:38:37 +0800 Subject: [PATCH 003/202] Update cudf JNI to 22.02.0-SNAPSHOT (#9681) Signed-off-by: Peixin Li cudfjni version update. NOTE: this includes change to use gpuci/cuda images since official cuda images is not ready yet on docker hub Authors: - Peixin (https://github.com/pxLi) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9681 --- java/ci/Dockerfile.centos7 | 8 +++++--- java/ci/README.md | 10 +++++----- java/pom.xml | 2 +- java/src/main/native/CMakeLists.txt | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/java/ci/Dockerfile.centos7 b/java/ci/Dockerfile.centos7 index a6264a84696..2ee57bfaeab 100644 --- a/java/ci/Dockerfile.centos7 +++ b/java/ci/Dockerfile.centos7 @@ -17,11 +17,13 @@ ### # Build the image for cudf development environment. # -# Arguments: CUDA_VERSION=11.0, 11.1, 11.2.0 or 11.2.2 +# Arguments: CUDA_VERSION=11.5.0 # ### -ARG CUDA_VERSION -FROM nvidia/cuda:$CUDA_VERSION-devel-centos7 +ARG CUDA_VERSION=11.5.0 +# use rapids gpuci/cuda images until nvidia/cuda cuda 11.5+ images are available in docker hub +# FROM nvidia/cuda:$CUDA_VERSION-devel-centos7 +FROM gpuci/cuda:$CUDA_VERSION-devel-centos7 ### Install basic requirements RUN yum install -y centos-release-scl diff --git a/java/ci/README.md b/java/ci/README.md index 5432dc8d0f1..0e947b62511 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -11,14 +11,14 @@ In the root path of cuDF repo, run below command to build the docker image. 
```bash -docker build -f java/ci/Dockerfile.centos7 --build-arg CUDA_VERSION=11.2.2 -t cudf-build:11.2.2-devel-centos7 . +docker build -f java/ci/Dockerfile.centos7 --build-arg CUDA_VERSION=11.5.0 -t cudf-build:11.5.0-devel-centos7 . ``` The following CUDA versions are supported w/ CUDA Enhanced Compatibility: * CUDA 11.0+ Change the --build-arg CUDA_VERSION to what you need. -You can replace the tag "cudf-build:11.2.2-devel-centos7" with another name you like. +You can replace the tag "cudf-build:11.5.0-devel-centos7" with another name you like. ## Start the docker then build @@ -26,7 +26,7 @@ You can replace the tag "cudf-build:11.2.2-devel-centos7" with another name you Run below command to start a docker container with GPU. ```bash -nvidia-docker run -it cudf-build:11.2.2-devel-centos7 bash +nvidia-docker run -it cudf-build:11.5.0-devel-centos7 bash ``` ### Download the cuDF source code @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.2.2-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-21.12 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-22.02 ``` ### Build cuDF jar with devtoolset @@ -47,5 +47,5 @@ scl enable devtoolset-9 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-21.12.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-22.02.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index 356d94455c8..87d43ec1272 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 21.12.0-SNAPSHOT + 22.02.0-SNAPSHOT cudfjni diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index d9fc3f337e7..a5a6646c7e6 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -26,7 +26,7 @@ rapids_cuda_init_architectures(CUDF_JNI) project( CUDF_JNI - VERSION 21.12.00 + VERSION 22.02.00 LANGUAGES C CXX CUDA ) From 7e4a985444148d727a1be457e745eff7fecc75fc Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 15 Nov 2021 20:51:11 -0800 Subject: [PATCH 004/202] Some improvements to `parse_decimal` function and bindings for `is_fixed_point` (#9658) This PR adds Java bindings for `is_fixed_point` Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Nghia Truong (https://github.com/ttnghia) - Robert (Bobby) Evans (https://github.com/revans2) - David Wendt (https://github.com/davidwendt) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9658 --- .../strings/convert/convert_fixed_point.hpp | 8 ++-- .../main/java/ai/rapids/cudf/ColumnView.java | 32 ++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 16 ++++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 38 +++++++++---------- 4 files changed, 69 insertions(+), 25 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index 7bcb7e72ab2..5fe5c880f9d 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -93,18 +93,16 @@ std::unique_ptr from_fixed_point( * @brief Returns a boolean column identifying strings in which all * characters are valid for conversion to fixed-point. 
* - * The output row entry is set to `true` if the corresponding string element - * has at least one character in [+-0123456789.]. The optional sign character - * must only be in the first position. The decimal point may only appear once. + * The sign and the exponent is optional. The decimal point may only appear once. * Also, the integer component must fit within the size limits of the * underlying fixed-point storage type. The value of the integer component * is based on the scale of the `decimal_type` provided. * * @code{.pseudo} * Example: - * s = ['123', '-456', '', '1.2.3', '+17E30', '12.34' '.789', '-0.005] + * s = ['123', '-456', '', '1.2.3', '+17E30', '12.34', '.789', '-0.005] * b = is_fixed_point(s) - * b is [true, true, false, false, false, true, true, true] + * b is [true, true, false, false, true, true, true, true] * @endcode * * Any null entries result in corresponding null entries in the output column. diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index aa9d3f0d9f3..329c251f72d 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -348,6 +348,34 @@ public final ColumnVector isNull() { return new ColumnVector(isNullNative(getNativeView())); } + /** + * Returns a Boolean vector with the same number of rows as this instance, that has + * TRUE for any entry that is a fixed-point, and FALSE if its not a fixed-point. + * A null will be returned for null entries. + * + * The sign and the exponent is optional. The decimal point may only appear once. + * The integer component must fit within the size limits of the underlying fixed-point + * storage type. The value of the integer component is based on the scale of the target + * decimalType. + * + * Example: + * vec = ["A", "nan", "Inf", "-Inf", "Infinity", "infinity", "2.1474", "112.383", "-2.14748", + * "NULL", "null", null, "1.2", "1.2e-4", "0.00012"] + * vec.isFixedPoint() = [false, false, false, false, false, false, true, true, true, false, false, + * null, true, true, true] + * + * @param decimalType the data type that should be used for bounds checking. Note that only + * Decimal types (fixed-point) are allowed. + * @return Boolean vector + */ + public final ColumnVector isFixedPoint(DType decimalType) { + assert type.equals(DType.STRING); + assert decimalType.isDecimalType(); + return new ColumnVector(isFixedPoint(getNativeView(), + decimalType.getTypeId().getNativeId(), decimalType.getScale())); + } + + /** * Returns a Boolean vector with the same number of rows as this instance, that has * TRUE for any entry that is an integer, and FALSE if its not an integer. A null will be returned @@ -375,6 +403,7 @@ public final ColumnVector isInteger() { */ public final ColumnVector isInteger(DType intType) { assert type.equals(DType.STRING); + assert intType.isBackedByInt() || intType.isBackedByLong(); return new ColumnVector(isIntegerWithType(getNativeView(), intType.getTypeId().getNativeId(), intType.getScale())); } @@ -3220,6 +3249,9 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { */ private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format); + + private static native long isFixedPoint(long viewHandle, int nativeTypeId, int scale); + /** * Native method to concatenate a list column of strings (each row is a list of strings), * concatenates the strings within each row and returns a single strings column result. 
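For reference, a minimal C++ sketch of exercising the `cudf::strings::is_fixed_point` API documented above, which these Java bindings ultimately wrap (the wrapper function name and the scale value are illustrative assumptions, and a linked libcudf build is assumed):

```c++
#include <cudf/column/column.hpp>
#include <cudf/strings/convert/convert_fixed_point.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Returns a BOOL8 column that is true for each row convertible to a
// DECIMAL32 value with scale -3; null input rows produce null outputs.
std::unique_ptr<cudf::column> check_decimal32(cudf::strings_column_view const& input)
{
  return cudf::strings::is_fixed_point(input,
                                       cudf::data_type{cudf::type_id::DECIMAL32, -3});
}
```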
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index d2a2030e24c..bce330ea4a3 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -2023,6 +2023,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isInteger(JNIEnv *env, jo CATCH_STD(env, 0) } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isFixedPoint(JNIEnv *env, jobject, + jlong handle, jint j_dtype, + jint scale) { + + JNI_NULL_CHECK(env, handle, "native view handle is null", 0) + + try { + cudf::jni::auto_set_device(env); + cudf::column_view *view = reinterpret_cast(handle); + cudf::data_type fp_dtype = cudf::jni::make_data_type(j_dtype, scale); + std::unique_ptr result = cudf::strings::is_fixed_point(*view, fp_dtype); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0) +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isIntegerWithType(JNIEnv *env, jobject, jlong handle, jint j_dtype, jint scale) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 0d007aa0ed7..4d52862f7b0 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -18,12 +18,7 @@ package ai.rapids.cudf; -import ai.rapids.cudf.HostColumnVector.BasicType; -import ai.rapids.cudf.HostColumnVector.DataType; -import ai.rapids.cudf.HostColumnVector.ListType; -import ai.rapids.cudf.HostColumnVector.StructData; -import ai.rapids.cudf.HostColumnVector.StructType; - +import ai.rapids.cudf.HostColumnVector.*; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -38,20 +33,9 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; -import static ai.rapids.cudf.QuantileMethod.HIGHER; -import static ai.rapids.cudf.QuantileMethod.LINEAR; -import static ai.rapids.cudf.QuantileMethod.LOWER; -import static ai.rapids.cudf.QuantileMethod.MIDPOINT; -import static ai.rapids.cudf.QuantileMethod.NEAREST; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; -import static ai.rapids.cudf.TableTest.assertStructColumnsAreEqual; -import static ai.rapids.cudf.TableTest.assertTablesAreEqual; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotEquals; -import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; +import static ai.rapids.cudf.QuantileMethod.*; +import static ai.rapids.cudf.TableTest.*; +import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assumptions.assumeTrue; public class ColumnVectorTest extends CudfTestBase { @@ -4834,6 +4818,20 @@ void testIsInteger() { } } + @Test + void testIsFixedPoint() { + String[] decimalStrings = {"A", "nan", "Inf", "-Inf", "Infinity", "infinity", + "2.1474", "112.383", "-2.14748", "NULL", "null", null, "1.2", "1.2e-4", "0.00012"}; + + DType dt = DType.create(DType.DTypeEnum.DECIMAL32, -3); + try (ColumnVector decStringCV = ColumnVector.fromStrings(decimalStrings); + ColumnVector isFixedPoint = decStringCV.isFixedPoint(dt); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, false, false, false + , false, true, true, true, false, false, null, true, true, true)) { + assertColumnsAreEqual(expected, 
isFixedPoint); + } + } + @Test void testIsFloat() { String[] floatStrings = {"A", "nan", "Inf", "-Inf", "Infinity", "infinity", "-0.0", "0.0", From c3bcc8d6d223a999b5beba3c60ad8af8d86844a0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 16 Nov 2021 10:36:25 -0600 Subject: [PATCH 005/202] Fix `null` handling when `boolean` dtype is passed (#9691) Fixes: #9642 This PR fixes issue where null values being treated as `False` when `boolean` dtype was being passed to the `Series` constructor. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9691 --- python/cudf/cudf/core/column/column.py | 5 +++++ python/cudf/cudf/tests/test_series.py | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5f9104263b1..6f2f01c746d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2060,6 +2060,11 @@ def as_column( return cudf.core.column.Decimal32Column.from_arrow( data ) + if is_bool_dtype(dtype): + # Need this special case handling for bool dtypes, + # since 'boolean' & 'pd.BooleanDtype' are not + # understood by np.dtype below. + dtype = "bool" np_type = np.dtype(dtype).type pa_type = np_to_pa_dtype(np.dtype(dtype)) data = as_column( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 09f0417b7ac..73fe46746ce 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1335,3 +1335,14 @@ def test_equals_names(lhs, rhs): expect = lhs.to_pandas().equals(rhs.to_pandas()) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", [[True, False, None, True, False], [None, None], []] +) +@pytest.mark.parametrize("bool_dtype", ["bool", "boolean", pd.BooleanDtype()]) +def test_nullable_bool_dtype_series(data, bool_dtype): + psr = pd.Series(data, dtype=pd.BooleanDtype()) + gsr = cudf.Series(data, dtype=bool_dtype) + + assert_eq(psr, gsr.to_pandas(nullable=True)) From e08ae9cb15fe260015cf70a22181fa67123e779f Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Tue, 16 Nov 2021 18:03:14 -0800 Subject: [PATCH 006/202] Implement Series.datetime.floor (#9571) Fixes: #7102 Replaces: [#9488](https://github.com/rapidsai/cudf/pull/9488/files) Authors: - Sheilah Kirui (https://github.com/skirui-source) - Mayank Anand (https://github.com/mayankanand007) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Michael Wang (https://github.com/isVoid) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9571 --- cpp/include/cudf/datetime.hpp | 93 ++++++++++- cpp/src/datetime/datetime_ops.cu | 199 ++++++++++++++++++----- cpp/tests/datetime/datetime_ops_test.cpp | 92 ++++++++++- docs/cudf/source/api_docs/series.rst | 2 + python/cudf/cudf/_lib/cpp/datetime.pxd | 17 +- python/cudf/cudf/_lib/datetime.pyx | 33 +++- python/cudf/cudf/core/column/datetime.py | 7 +- python/cudf/cudf/core/series.py | 73 ++++++++- python/cudf/cudf/tests/test_datetime.py | 39 ++++- 9 files changed, 502 insertions(+), 53 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index d67984daa7c..71e5968bf07 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -378,5 +378,96 @@ std::unique_ptr<cudf::column> ceil_nanosecond(
   column_view const& column,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Round down to the nearest day
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> floor_day(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round down to the nearest hour
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> floor_hour(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round down to the nearest minute
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> floor_minute(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round down to the nearest second
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> floor_second(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round down to the nearest millisecond
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> floor_millisecond(
+  column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round down to the nearest microsecond
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_microsecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest nanosecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_nanosecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace datetime } // namespace cudf diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ccfad56b4ea..717bd7ac0a8 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -54,6 +54,8 @@ enum class datetime_component { NANOSECOND }; +enum class rounding_kind { CEIL, FLOOR }; + template struct extract_component_operator { template @@ -88,32 +90,59 @@ struct extract_component_operator { } }; -template -struct ceil_timestamp { +// This functor takes the rounding type as runtime info and dispatches to the ceil/floor/round +// function. +template +struct RoundFunctor { + template + CUDA_DEVICE_CALLABLE auto operator()(rounding_kind round_kind, Timestamp dt) + { + switch (round_kind) { + case rounding_kind::CEIL: return cuda::std::chrono::ceil(dt); + case rounding_kind::FLOOR: return cuda::std::chrono::floor(dt); + default: cudf_assert(false && "Unsupported rounding kind."); + } + __builtin_unreachable(); + } +}; + +struct RoundingDispatcher { + rounding_kind round_kind; + datetime_component component; + + RoundingDispatcher(rounding_kind round_kind, datetime_component component) + : round_kind(round_kind), component(component) + { + } + template CUDA_DEVICE_CALLABLE Timestamp operator()(Timestamp const ts) const { - using namespace cuda::std::chrono; - // want to use this with D, H, T (minute), S, L (millisecond), U - switch (COMPONENT) { + switch (component) { case datetime_component::DAY: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::HOUR: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::MINUTE: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::SECOND: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::MILLISECOND: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::MICROSECOND: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::NANOSECOND: - return time_point_cast(ceil(ts)); - default: cudf_assert(false && "Unexpected resolution"); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); + default: cudf_assert(false && "Unsupported datetime rounding resolution."); } - - return {}; + __builtin_unreachable(); } }; @@ -196,10 +225,11 @@ struct is_leap_year_op { }; // Specific function for applying ceil/floor date ops -template -struct dispatch_ceil { +struct dispatch_round { template 
std::enable_if_t(), std::unique_ptr> operator()( + rounding_kind round_kind, + datetime_component component, cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -221,7 +251,7 @@ struct dispatch_ceil { column.begin(), column.end(), output->mutable_view().begin(), - TransformFunctor{}); + RoundingDispatcher{round_kind, component}); return output; } @@ -384,13 +414,14 @@ std::unique_ptr add_calendrical_months(column_view const& timestamp_colu } } -template -std::unique_ptr ceil_general(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr round_general(rounding_kind round_kind, + datetime_component component, + column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return cudf::type_dispatcher( - column.type(), dispatch_ceil>{}, column, stream, mr); + column.type(), dispatch_round{}, round_kind, component, column, stream, mr); } std::unique_ptr extract_year(column_view const& column, @@ -498,53 +529,147 @@ std::unique_ptr extract_quarter(column_view const& column, std::unique_ptr ceil_day(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::DAY, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_hour(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::HOUR, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_minute(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MINUTE, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_second(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::SECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_millisecond(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MILLISECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_microsecond(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MICROSECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_nanosecond(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::NANOSECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_day(column_view const& column, 
rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::DAY, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_hour(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::HOUR, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_minute(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MINUTE, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_second(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::SECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_millisecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MILLISECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_microsecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MICROSECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_nanosecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::NANOSECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index c0d2d1cc447..b70ac29fd5d 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -357,9 +357,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestCeilDatetime) using namespace cuda::std::chrono; auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); auto host_val = to_host(input); thrust::host_vector timestamps = host_val.first; @@ -403,6 +403,22 @@ TYPED_TEST(TypedDatetimeOpsTest, TestCeilDatetime) auto expected_millisecond = fixed_width_column_wrapper( ceiled_millisecond.begin(), ceiled_millisecond.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_millisecond(input), expected_millisecond); + + std::vector ceiled_microsecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_microsecond.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_microsecond = fixed_width_column_wrapper( + ceiled_microsecond.begin(), ceiled_microsecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_microsecond(input), expected_microsecond); + + std::vector ceiled_nanosecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_nanosecond.begin(), 
[](auto i) {
+    return time_point_cast(ceil(i));
+  });
+  auto expected_nanosecond = fixed_width_column_wrapper(
+    ceiled_nanosecond.begin(), ceiled_nanosecond.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_nanosecond(input), expected_nanosecond);
 }
 
 TEST_F(BasicDatetimeOpsTest, TestDayOfYearWithDate)
@@ -827,4 +843,76 @@ TEST_F(BasicDatetimeOpsTest, TestQuarter)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_quarter(timestamps_s), quarter);
 }
 
+TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime)
+{
+  using T = TypeParam;
+  using namespace cudf::test;
+  using namespace cudf::datetime;
+  using namespace cuda::std::chrono;
+
+  auto start = milliseconds(-2500000000000);  // Sat, 11 Oct 1890 19:33:20 GMT
+  auto stop  = milliseconds(2500000000000);   // Mon, 22 Mar 2049 04:26:40 GMT
+
+  auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop));
+
+  auto host_val = to_host(input);
+  thrust::host_vector timestamps = host_val.first;
+
+  std::vector floored_day(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_day.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_day = fixed_width_column_wrapper(floored_day.begin(),
+                                                 floored_day.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_day(input), expected_day);
+
+  std::vector floored_hour(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_hour.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_hour = fixed_width_column_wrapper(
+    floored_hour.begin(), floored_hour.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_hour(input), expected_hour);
+
+  std::vector floored_minute(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_minute.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_minute = fixed_width_column_wrapper(
+    floored_minute.begin(), floored_minute.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_minute(input), expected_minute);
+
+  std::vector floored_second(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_second.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_second = fixed_width_column_wrapper(
+    floored_second.begin(), floored_second.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_second(input), expected_second);
+
+  std::vector floored_millisecond(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_millisecond.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_millisecond = fixed_width_column_wrapper(
+    floored_millisecond.begin(), floored_millisecond.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_millisecond(input), expected_millisecond);
+
+  std::vector floored_microsecond(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_microsecond.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_microsecond = fixed_width_column_wrapper(
+    floored_microsecond.begin(), floored_microsecond.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_microsecond(input), expected_microsecond);
+
+  std::vector floored_nanosecond(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_nanosecond.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_nanosecond = fixed_width_column_wrapper(
+    floored_nanosecond.begin(), floored_nanosecond.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_nanosecond(input), expected_nanosecond);
+}
+
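+// Illustrative sketch only, not part of the tests above: the behavior these
+// tests pin down, assuming a TIMESTAMP_SECONDS column named `input` holding
+// 2001-01-01 00:04:45.
+//
+//   auto floored = cudf::datetime::floor_minute(input);  // 2001-01-01 00:04:00
+//   auto ceiled  = cudf::datetime::ceil_minute(input);   // 2001-01-01 00:05:00
+//
+// floor_* never yields a value later than its input, ceil_* never yields one
+// earlier, and both return a column of the same timestamp resolution as the
+// input.
+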
CUDF_TEST_PROGRAM_MAIN() diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 46a31a0dcf6..b90ee628332 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -300,6 +300,8 @@ Datetime methods strftime isocalendar + ceil + floor Timedelta properties diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 2af4dd648c5..38ed9fbd769 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -23,7 +23,22 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] ceil_microsecond( const column_view& column ) except + - cdef unique_ptr[column] ceil_nanosecond(const column_view& column) except + + cdef unique_ptr[column] ceil_nanosecond( + const column_view& column + ) except + + cdef unique_ptr[column] floor_day(const column_view& column) except + + cdef unique_ptr[column] floor_hour(const column_view& column) except + + cdef unique_ptr[column] floor_minute(const column_view& column) except + + cdef unique_ptr[column] floor_second(const column_view& column) except + + cdef unique_ptr[column] floor_millisecond( + const column_view& column + ) except + + cdef unique_ptr[column] floor_microsecond( + const column_view& column + ) except + + cdef unique_ptr[column] floor_nanosecond( + const column_view& column + ) except + cdef unique_ptr[column] add_calendrical_months( const column_view& timestamps, const column_view& months diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 5cda06362b6..3215088c438 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -72,13 +72,13 @@ def ceil_datetime(Column col, object field): c_result = move(libcudf_datetime.ceil_day(col_view)) elif field == "H": c_result = move(libcudf_datetime.ceil_hour(col_view)) - elif field == "T": + elif field == "T" or field == "min": c_result = move(libcudf_datetime.ceil_minute(col_view)) elif field == "S": c_result = move(libcudf_datetime.ceil_second(col_view)) - elif field == "L": + elif field == "L" or field == "ms": c_result = move(libcudf_datetime.ceil_millisecond(col_view)) - elif field == "U": + elif field == "U" or field == "us": c_result = move(libcudf_datetime.ceil_microsecond(col_view)) elif field == "N": c_result = move(libcudf_datetime.ceil_nanosecond(col_view)) @@ -89,6 +89,33 @@ def ceil_datetime(Column col, object field): return result +def floor_datetime(Column col, object field): + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + + with nogil: + # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.resolution_string.html + if field == "D": + c_result = move(libcudf_datetime.floor_day(col_view)) + elif field == "H": + c_result = move(libcudf_datetime.floor_hour(col_view)) + elif field == "T" or field == "min": + c_result = move(libcudf_datetime.floor_minute(col_view)) + elif field == "S": + c_result = move(libcudf_datetime.floor_second(col_view)) + elif field == "L" or field == "ms": + c_result = move(libcudf_datetime.floor_millisecond(col_view)) + elif field == "U" or field == "us": + c_result = move(libcudf_datetime.floor_microsecond(col_view)) + elif field == "N": + c_result = move(libcudf_datetime.floor_nanosecond(col_view)) + else: + raise ValueError(f"Invalid resolution: '{field}'") + + result = Column.from_unique_ptr(move(c_result)) + return result + + def is_leap_year(Column col): """Returns a 
boolean indicator whether the year of the date is a leap year
     """
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 68379002e6b..756e48edccb 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -222,8 +222,11 @@ def values(self):
     def get_dt_field(self, field: str) -> ColumnBase:
         return libcudf.datetime.extract_datetime_component(self, field)
 
-    def ceil(self, field: str) -> ColumnBase:
-        return libcudf.datetime.ceil_datetime(self, field)
+    def ceil(self, freq: str) -> ColumnBase:
+        return libcudf.datetime.ceil_datetime(self, freq)
+
+    def floor(self, freq: str) -> ColumnBase:
+        return libcudf.datetime.floor_datetime(self, freq)
 
     def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike:
         if isinstance(other, cudf.Scalar):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 00a8ebabe34..c804f2bca2c 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4592,11 +4592,76 @@ def _get_dt_field(self, field):
             data=out_column, index=self.series._index, name=self.series.name
         )
 
-    def ceil(self, field):
-        out_column = self.series._column.ceil(field)
+    def ceil(self, freq):
+        """
+        Perform ceil operation on the data to the specified freq.
 
-        return Series(
-            data=out_column, index=self.series._index, name=self.series.name
+        Parameters
+        ----------
+        freq : str
+            One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"].
+            Must be a fixed frequency like 'S' (second) not 'ME' (month end).
+            See `frequency aliases `__
+            for more details on these aliases.
+
+        Returns
+        -------
+        Series
+            Series with all timestamps rounded up to the specified frequency.
+            The index is preserved.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> t = cudf.Series(["2001-01-01 00:04:45", "2001-01-01 00:04:58",
+        ...     "2001-01-01 00:05:04"], dtype="datetime64[ns]")
+        >>> t.dt.ceil("T")
+        0   2001-01-01 00:05:00
+        1   2001-01-01 00:05:00
+        2   2001-01-01 00:06:00
+        dtype: datetime64[ns]
+        """
+        out_column = self.series._column.ceil(freq)
+
+        return Series._from_data(
+            data={self.series.name: out_column}, index=self.series._index
+        )
+
+    def floor(self, freq):
+        """
+        Perform floor operation on the data to the specified freq.
+
+        Parameters
+        ----------
+        freq : str
+            One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"].
+            Must be a fixed frequency like 'S' (second) not 'ME' (month end).
+            See `frequency aliases `__
+            for more details on these aliases.
+
+        Returns
+        -------
+        Series
+            Series with all timestamps rounded down to the specified frequency.
+            The index is preserved.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> t = cudf.Series(["2001-01-01 00:04:45", "2001-01-01 00:04:58",
+        ...
"2001-01-01 00:05:04"], dtype="datetime64[ns]") + >>> t.dt.floor("T") + 0 2001-01-01 00:04:00 + 1 2001-01-01 00:04:00 + 2 2001-01-01 00:05:00 + dtype: datetime64[ns] + """ + out_column = self.series._column.floor(freq) + + return Series._from_data( + data={self.series.name: out_column}, index=self.series._index ) def strftime(self, date_format, *args, **kwargs): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index d666dfc0ec1..bf75badc06f 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1777,12 +1777,45 @@ def test_error_values(): ], ) @pytest.mark.parametrize("time_type", DATETIME_TYPES) -@pytest.mark.parametrize("resolution", ["D", "H", "T", "S", "L", "U", "N"]) +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) def test_ceil(data, time_type, resolution): - ps = pd.Series(data, dtype=time_type) - gs = cudf.from_pandas(ps) + gs = cudf.Series(data, dtype=time_type) + ps = gs.to_pandas() expect = ps.dt.ceil(resolution) got = gs.dt.ceil(resolution) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + ( + [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:10", + "2000-12-31 04:00:05", + "1900-02-28 07:00:06", + "1800-03-14 07:30:20", + "2100-03-14 07:30:20", + "1970-01-01 00:00:09", + "1969-12-31 12:59:10", + ] + ) + ], +) +@pytest.mark.parametrize("time_type", DATETIME_TYPES) +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) +def test_floor(data, time_type, resolution): + + gs = cudf.Series(data, dtype=time_type) + ps = gs.to_pandas() + + expect = ps.dt.floor(resolution) + got = gs.dt.floor(resolution) + assert_eq(expect, got) From 4d13d81bb04a51a1ad7f476184c2b1eb88038126 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Tue, 16 Nov 2021 21:11:28 -0800 Subject: [PATCH 007/202] Fixed build by adding more checks for int8, int16 (#9707) Add additional checks for int8, int16 fixes [#/rapidsai/cudf/4127](https://github.com/NVIDIA/spark-rapids/issues/4127) Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/9707 --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 329c251f72d..729444f460c 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -398,12 +398,13 @@ public final ColumnVector isInteger() { * for null entries. * * @param intType the data type that should be used for bounds checking. Note that only - * integer types are allowed. 
+   *                cudf integer types are allowed, including signed/unsigned int8 through int64
    * @return Boolean vector
    */
   public final ColumnVector isInteger(DType intType) {
     assert type.equals(DType.STRING);
-    assert intType.isBackedByInt() || intType.isBackedByLong();
+    assert intType.isBackedByInt() || intType.isBackedByLong() || intType.isBackedByByte()
+        || intType.isBackedByShort();
     return new ColumnVector(isIntegerWithType(getNativeView(),
         intType.getTypeId().getNativeId(), intType.getScale()));
   }

From 91141042ac5ce5024975eb2eab63f916047e6b6a Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Wed, 17 Nov 2021 10:31:10 -0800
Subject: [PATCH 008/202] Add parameters to control row group size in Parquet
 writer (#9677)

Closes https://github.com/rapidsai/cudf/issues/9615

Adds the following API to the Parquet writer:

- Set maximum row group size, in bytes (minimum of 512KB);
- Set maximum row group size, in rows (minimum of 5000).

The API is more limited than its ORC equivalent because of limitations in Parquet page size control/estimation.

Other changes:

- Fix naming in some ORC APIs to be consistent.
- Change `rowgroup` to `row_group` in APIs, since Parquet specs refer to this as "row group", not "rowgroup".
- Replace some `uint32_t` use in Parquet writer.
- Remove unused `target_page_size`.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/9677
---
 cpp/include/cudf/io/detail/parquet.hpp   |   2 +-
 cpp/include/cudf/io/orc.hpp              |  20 ++--
 cpp/include/cudf/io/parquet.hpp          | 125 ++++++++++++++++++++-
 cpp/src/io/functions.cpp                 |   6 +-
 cpp/src/io/orc/writer_impl.cu            |  12 +--
 cpp/src/io/parquet/writer_impl.cu        |  79 +++++++------
 cpp/src/io/parquet/writer_impl.hpp       |  12 +--
 cpp/tests/io/parquet_test.cpp            |  22 ++++
 python/cudf/cudf/_lib/cpp/io/orc.pxd     |   8 +-
 python/cudf/cudf/_lib/cpp/io/parquet.pxd |  22 +++-
 python/cudf/cudf/_lib/parquet.pyx        |  34 +++---
 python/cudf/cudf/io/parquet.py           |   6 ++
 python/cudf/cudf/tests/test_parquet.py   |  23 +++++
 python/cudf/cudf/utils/ioutils.py        |  10 +-
 14 files changed, 291 insertions(+), 90 deletions(-)

diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index 14f27ef8eef..98922ad10a4 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -148,7 +148,7 @@ class writer {
    * @param[in] metadata_list List of input file metadata
    * @return A parquet-compatible blob that contains the data for all rowgroups in the list
    */
-  static std::unique_ptr> merge_rowgroup_metadata(
+  static std::unique_ptr> merge_row_group_metadata(
     const std::vector>>& metadata_list);
 };

diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index fb1199fc166..3bc2e6c9ef2 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -475,24 +475,24 @@ class orc_writer_options {
   /**
    * @brief Whether writing column statistics is enabled/disabled.
    */
-  bool enable_statistics() const { return _enable_statistics; }
+  bool is_enabled_statistics() const { return _enable_statistics; }
 
   /**
    * @brief Returns maximum stripe size, in bytes.
   */
-  auto stripe_size_bytes() const { return _stripe_size_bytes; }
+  auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
 
   /**
   * @brief Returns maximum stripe size, in rows.
*/ - auto stripe_size_rows() const { return _stripe_size_rows; } + auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. */ - auto row_index_stride() const + auto get_row_index_stride() const { - auto const unaligned_stride = std::min(_row_index_stride, stripe_size_rows()); + auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows()); return unaligned_stride - unaligned_stride % 8; } @@ -769,24 +769,24 @@ class chunked_orc_writer_options { /** * @brief Whether writing column statistics is enabled/disabled. */ - bool enable_statistics() const { return _enable_statistics; } + bool is_enabled_statistics() const { return _enable_statistics; } /** * @brief Returns maximum stripe size, in bytes. */ - auto stripe_size_bytes() const { return _stripe_size_bytes; } + auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. */ - auto stripe_size_rows() const { return _stripe_size_rows; } + auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. */ - auto row_index_stride() const + auto get_row_index_stride() const { - auto const unaligned_stride = std::min(_row_index_stride, stripe_size_rows()); + auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows()); return unaligned_stride - unaligned_stride % 8; } diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 660ec051304..88cf7416506 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -37,6 +37,9 @@ namespace io { * @file */ +constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; // 128MB +constexpr size_type default_row_group_size_rows = 1000000; + /** * @brief Builds parquet_reader_options to use for `read_parquet()`. */ @@ -398,6 +401,10 @@ class parquet_writer_options { bool _write_timestamps_as_int96 = false; // Column chunks file path to be set in the raw output metadata std::string _column_chunks_file_path; + // Maximum size of each row group (unless smaller than a single page) + size_t _row_group_size_bytes = default_row_group_size_bytes; + // Maximum number of rows in row group (unless smaller than a single page) + size_type _row_group_size_rows = default_row_group_size_rows; /** * @brief Constructor from sink and table. @@ -472,6 +479,16 @@ class parquet_writer_options { */ std::string get_column_chunks_file_path() const { return _column_chunks_file_path; } + /** + * @brief Returns maximum row group size, in bytes. + */ + auto get_row_group_size_bytes() const { return _row_group_size_bytes; } + + /** + * @brief Returns maximum row group size, in rows. + */ + auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** * @brief Sets metadata. * @@ -510,6 +527,28 @@ class parquet_writer_options { { _column_chunks_file_path.assign(file_path); } + + /** + * @brief Sets the maximum row group size, in bytes. + */ + void set_row_group_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS( + size_bytes >= 512 * 1024, + "The maximum row group size cannot be smaller than the page size, which is 512KB."); + _row_group_size_bytes = size_bytes; + } + + /** + * @brief Sets the maximum row group size, in rows. 
+ */ + void set_row_group_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum row group size cannot be smaller than the page size, which is 5000 rows."); + _row_group_size_rows = size_rows; + } }; class parquet_writer_options_builder { @@ -582,6 +621,30 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Sets the maximum row group size, in bytes. + * + * @param val maximum row group size + * @return this for chaining. + */ + parquet_writer_options_builder& row_group_size_bytes(size_t val) + { + options.set_row_group_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum number of rows in output row groups. + * + * @param val maximum number or rows + * @return this for chaining. + */ + parquet_writer_options_builder& row_group_size_rows(size_type val) + { + options.set_row_group_size_rows(val); + return *this; + } + /** * @brief Sets whether int96 timestamps are written or not in parquet_writer_options. * @@ -637,7 +700,7 @@ std::unique_ptr> write_parquet( * @param[in] metadata_list List of input file metadata. * @return A parquet-compatible blob that contains the data for all row groups in the list. */ -std::unique_ptr> merge_rowgroup_metadata( +std::unique_ptr> merge_row_group_metadata( const std::vector>>& metadata_list); /** @@ -660,6 +723,10 @@ class chunked_parquet_writer_options { // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; + // Maximum size of each row group (unless smaller than a single page) + size_t _row_group_size_bytes = default_row_group_size_bytes; + // Maximum number of rows in row group (unless smaller than a single page) + size_type _row_group_size_rows = default_row_group_size_rows; /** * @brief Constructor from sink. @@ -703,6 +770,16 @@ class chunked_parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + /** + * @brief Returns maximum row group size, in bytes. + */ + auto get_row_group_size_bytes() const { return _row_group_size_bytes; } + + /** + * @brief Returns maximum row group size, in rows. + */ + auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** * @brief Sets metadata. * @@ -732,6 +809,28 @@ class chunked_parquet_writer_options { */ void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + /** + * @brief Sets the maximum row group size, in bytes. + */ + void set_row_group_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS( + size_bytes >= 512 * 1024, + "The maximum row group size cannot be smaller than the page size, which is 512KB."); + _row_group_size_bytes = size_bytes; + } + + /** + * @brief Sets the maximum row group size, in rows. + */ + void set_row_group_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum row group size cannot be smaller than the page size, which is 5000 rows."); + _row_group_size_rows = size_rows; + } + /** * @brief creates builder to build chunked_parquet_writer_options. * @@ -811,6 +910,30 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Sets the maximum row group size, in bytes. + * + * @param val maximum row group size + * @return this for chaining. 
+ */ + chunked_parquet_writer_options_builder& row_group_size_bytes(size_t val) + { + options.set_row_group_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum number of rows in output row groups. + * + * @param val maximum number or rows + * @return this for chaining. + */ + chunked_parquet_writer_options_builder& row_group_size_rows(size_type val) + { + options.set_row_group_size_rows(val); + return *this; + } + /** * @brief move chunked_parquet_writer_options member once it's built. */ diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index b678941db21..a8ca1d3a459 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -405,13 +405,13 @@ table_with_metadata read_parquet(parquet_reader_options const& options, } /** - * @copydoc cudf::io::merge_rowgroup_metadata + * @copydoc cudf::io::merge_row_group_metadata */ -std::unique_ptr> merge_rowgroup_metadata( +std::unique_ptr> merge_row_group_metadata( const std::vector>>& metadata_list) { CUDF_FUNC_RANGE(); - return detail_parquet::writer::merge_rowgroup_metadata(metadata_list); + return detail_parquet::writer::merge_row_group_metadata(metadata_list); } table_input_metadata::table_input_metadata(table_view const& table, diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 2bf020d08a2..1563e3e1fd7 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1301,10 +1301,10 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), - max_stripe_size{options.stripe_size_bytes(), options.stripe_size_rows()}, - row_index_stride{options.row_index_stride()}, + max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, + row_index_stride{options.get_row_index_stride()}, compression_kind_(to_orc_compression(options.get_compression())), - enable_statistics_(options.enable_statistics()), + enable_statistics_(options.is_enabled_statistics()), single_write_mode(mode == SingleWriteMode::YES), out_sink_(std::move(sink)) { @@ -1321,10 +1321,10 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), - max_stripe_size{options.stripe_size_bytes(), options.stripe_size_rows()}, - row_index_stride{options.row_index_stride()}, + max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, + row_index_stride{options.get_row_index_stride()}, compression_kind_(to_orc_compression(options.get_compression())), - enable_statistics_(options.enable_statistics()), + enable_statistics_(options.is_enabled_statistics()), single_write_mode(mode == SingleWriteMode::YES), out_sink_(std::move(sink)) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 2c7d745bb4c..62803432157 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1026,6 +1026,8 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + max_row_group_size{options.get_row_group_size_bytes()}, + max_row_group_rows{options.get_row_group_size_rows()}, compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), @@ -1045,6 +1047,8 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + max_row_group_size{options.get_row_group_size_bytes()}, + 
max_row_group_rows{options.get_row_group_size_rows()}, compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), @@ -1148,8 +1152,7 @@ void writer::impl::write(table_view const& table) // compression/decompression performance). using cudf::io::parquet::gpu::max_page_fragment_size; - uint32_t num_fragments = - (uint32_t)((num_rows + max_page_fragment_size - 1) / max_page_fragment_size); + size_type const num_fragments = (num_rows + max_page_fragment_size - 1) / max_page_fragment_size; cudf::detail::hostdevice_2dvector fragments( num_columns, num_fragments, stream); @@ -1162,21 +1165,20 @@ void writer::impl::write(table_view const& table) init_page_fragments(fragments, col_desc, num_rows, max_page_fragment_size); } - size_t global_rowgroup_base = md.row_groups.size(); + auto const global_rowgroup_base = static_cast(md.row_groups.size()); // Decide row group boundaries based on uncompressed data size - size_t rowgroup_size = 0; - uint32_t num_rowgroups = 0; - for (uint32_t f = 0, global_r = global_rowgroup_base, rowgroup_start = 0; f < num_fragments; - f++) { - size_t fragment_data_size = 0; + auto rowgroup_size = 0ul; + auto num_rowgroups = 0; + for (auto f = 0, global_r = global_rowgroup_base, rowgroup_start = 0; f < num_fragments; f++) { + auto fragment_data_size = 0ul; // Replace with STL algorithm to transform and sum for (auto i = 0; i < num_columns; i++) { fragment_data_size += fragments[i][f].fragment_data_size; } if (f > rowgroup_start && - (rowgroup_size + fragment_data_size > max_rowgroup_size_ || - (f + 1 - rowgroup_start) * max_page_fragment_size > max_rowgroup_rows_)) { + (rowgroup_size + fragment_data_size > max_row_group_size || + (f + 1 - rowgroup_start) * max_page_fragment_size > max_row_group_rows)) { // update schema md.row_groups.resize(md.row_groups.size() + 1); md.row_groups[global_r++].num_rows = (f - rowgroup_start) * max_page_fragment_size; @@ -1204,15 +1206,15 @@ void writer::impl::write(table_view const& table) } } // Initialize row groups and column chunks - uint32_t num_chunks = num_rowgroups * num_columns; + auto const num_chunks = num_rowgroups * num_columns; hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); - for (uint32_t r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; + for (auto r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; r++, global_r++) { - uint32_t fragments_in_chunk = (uint32_t)( - (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size); + size_type const fragments_in_chunk = + (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size; md.row_groups[global_r].total_byte_size = 0; md.row_groups[global_r].columns.resize(num_columns); - for (int i = 0; i < num_columns; i++) { + for (auto i = 0; i < num_columns; i++) { gpu::EncColumnChunk* ck = &chunks[r][i]; *ck = {}; @@ -1244,8 +1246,8 @@ void writer::impl::write(table_view const& table) } auto dict_info_owner = build_chunk_dictionaries(chunks, col_desc, num_rows, stream); - for (uint32_t rg = 0, global_rg = global_rowgroup_base; rg < num_rowgroups; rg++, global_rg++) { - for (int col = 0; col < num_columns; col++) { + for (auto rg = 0, global_rg = global_rowgroup_base; rg < num_rowgroups; rg++, global_rg++) { + for (auto col = 0; col < num_columns; col++) { if (chunks.host_view()[rg][col].use_dictionary) { 
md.row_groups[global_rg].columns[col].meta_data.encodings.push_back( Encoding::PLAIN_DICTIONARY); @@ -1274,16 +1276,16 @@ void writer::impl::write(table_view const& table) } // Initialize batches of rowgroups to encode (mainly to limit peak memory usage) - std::vector batch_list; - uint32_t num_pages = 0; - size_t max_bytes_in_batch = 1024 * 1024 * 1024; // 1GB - TBD: Tune this - size_t max_uncomp_bfr_size = 0; - size_t max_comp_bfr_size = 0; - size_t max_chunk_bfr_size = 0; - uint32_t max_pages_in_batch = 0; - size_t bytes_in_batch = 0; - size_t comp_bytes_in_batch = 0; - for (uint32_t r = 0, groups_in_batch = 0, pages_in_batch = 0; r <= num_rowgroups; r++) { + std::vector batch_list; + size_type num_pages = 0; + size_t max_bytes_in_batch = 1024 * 1024 * 1024; // 1GB - TODO: Tune this + size_t max_uncomp_bfr_size = 0; + size_t max_comp_bfr_size = 0; + size_t max_chunk_bfr_size = 0; + size_type max_pages_in_batch = 0; + size_t bytes_in_batch = 0; + size_t comp_bytes_in_batch = 0; + for (size_type r = 0, groups_in_batch = 0, pages_in_batch = 0; r <= num_rowgroups; r++) { size_t rowgroup_size = 0; size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { @@ -1331,11 +1333,11 @@ void writer::impl::write(table_view const& table) // This contains stats for both the pages and the rowgroups. TODO: make them separate. rmm::device_uvector page_stats(num_stats_bfr, stream); - for (uint32_t b = 0, r = 0; b < (uint32_t)batch_list.size(); b++) { - uint8_t* bfr = static_cast(uncomp_bfr.data()); - uint8_t* bfr_c = static_cast(comp_bfr.data()); - for (uint32_t j = 0; j < batch_list[b]; j++, r++) { - for (int i = 0; i < num_columns; i++) { + for (auto b = 0, r = 0; b < static_cast(batch_list.size()); b++) { + auto bfr = static_cast(uncomp_bfr.data()); + auto bfr_c = static_cast(comp_bfr.data()); + for (auto j = 0; j < batch_list[b]; j++, r++) { + for (auto i = 0; i < num_columns; i++) { gpu::EncColumnChunk* ck = &chunks[r][i]; ck->uncompressed_bfr = bfr; ck->compressed_bfr = bfr_c; @@ -1360,14 +1362,15 @@ void writer::impl::write(table_view const& table) pinned_buffer host_bfr{nullptr, cudaFreeHost}; // Encode row groups in batches - for (uint32_t b = 0, r = 0, global_r = global_rowgroup_base; b < (uint32_t)batch_list.size(); + for (auto b = 0, r = 0, global_r = global_rowgroup_base; + b < static_cast(batch_list.size()); b++) { // Count pages in this batch - uint32_t rnext = r + batch_list[b]; - uint32_t first_page_in_batch = chunks[r][0].first_page; - uint32_t first_page_in_next_batch = + auto const rnext = r + batch_list[b]; + auto const first_page_in_batch = chunks[r][0].first_page; + auto const first_page_in_next_batch = (rnext < num_rowgroups) ? 
chunks[rnext][0].first_page : num_pages; - uint32_t pages_in_batch = first_page_in_next_batch - first_page_in_batch; + auto const pages_in_batch = first_page_in_next_batch - first_page_in_batch; // device_span batch_pages{pages.data() + first_page_in_batch, } encode_pages( chunks, @@ -1514,7 +1517,7 @@ std::unique_ptr> writer::close(std::string const& column_ch return _impl->close(column_chunks_file_path); } -std::unique_ptr> writer::merge_rowgroup_metadata( +std::unique_ptr> writer::merge_row_group_metadata( const std::vector>>& metadata_list) { std::vector output; diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index c7cdf8effd1..9188218f607 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -56,13 +56,6 @@ using cudf::detail::hostdevice_2dvector; * @brief Implementation for parquet writer */ class writer::impl { - // Parquet datasets are divided into fixed-size, independent rowgroups - static constexpr uint32_t DEFAULT_ROWGROUP_MAXSIZE = 128 * 1024 * 1024; // 128MB - static constexpr uint32_t DEFAULT_ROWGROUP_MAXROWS = 1000000; // Or at most 1M rows - - // rowgroups are divided into pages - static constexpr uint32_t DEFAULT_TARGET_PAGE_SIZE = 512 * 1024; - public: /** * @brief Constructor with writer options. @@ -209,9 +202,8 @@ class writer::impl { // Cuda stream to be used rmm::cuda_stream_view stream = rmm::cuda_stream_default; - size_t max_rowgroup_size_ = DEFAULT_ROWGROUP_MAXSIZE; - size_t max_rowgroup_rows_ = DEFAULT_ROWGROUP_MAXROWS; - size_t target_page_size_ = DEFAULT_TARGET_PAGE_SIZE; + size_t max_row_group_size = default_row_group_size_bytes; + size_type max_row_group_rows = default_row_group_size_rows; Compression compression_ = Compression::UNCOMPRESSED; statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 3bae8d7ab1e..b233819092a 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -3056,4 +3056,26 @@ TEST_F(ParquetReaderTest, EmptyOutput) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +TEST_F(ParquetWriterTest, RowGroupSizeInvalid) +{ + const auto unused_table = std::make_unique
(); + std::vector out_buffer; + + EXPECT_THROW( + cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .row_group_size_rows(4999), + cudf::logic_error); + EXPECT_THROW( + cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .row_group_size_bytes(511 << 10), + cudf::logic_error); + + EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) + .row_group_size_rows(4999), + cudf::logic_error); + EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) + .row_group_size_bytes(511 << 10), + cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index f0450483345..4b5ec913fb6 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -72,10 +72,10 @@ cdef extern from "cudf/io/orc.hpp" \ orc_writer_options() cudf_io_types.sink_info get_sink() except+ cudf_io_types.compression_type get_compression() except+ - bool enable_statistics() except+ - size_t stripe_size_bytes() except+ - size_type stripe_size_rows() except+ - size_type row_index_stride() except+ + bool is_enabled_statistics() except+ + size_t get_stripe_size_bytes() except+ + size_type get_stripe_size_rows() except+ + size_type get_row_index_stride() except+ cudf_table_view.table_view get_table() except+ const cudf_io_types.table_input_metadata *get_metadata() except+ diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 81ca7e5836b..9d95dce83bc 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -74,6 +74,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_table_view.table_view get_table() except + const cudf_io_types.table_input_metadata get_metadata() except + string get_column_chunks_file_path() except+ + size_t get_row_group_size_bytes() except+ + size_type get_row_group_size_rows() except+ void set_metadata( cudf_io_types.table_input_metadata *m @@ -87,6 +89,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_column_chunks_file_path( string column_chunks_file_path ) except + + void set_row_group_size_bytes(size_t val) except+ + void set_row_group_size_rows(size_type val) except+ @staticmethod parquet_writer_options_builder builder( @@ -116,6 +120,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& int96_timestamps( bool enabled ) except + + parquet_writer_options_builder& row_group_size_bytes( + size_t val + ) except+ + parquet_writer_options_builder& row_group_size_rows( + size_type val + ) except+ parquet_writer_options build() except + @@ -130,6 +140,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.statistics_freq get_stats_level() except + cudf_io_types.table_input_metadata* get_metadata( ) except+ + size_t get_row_group_size_bytes() except+ + size_type get_row_group_size_rows() except+ void set_metadata( cudf_io_types.table_input_metadata *m @@ -140,6 +152,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_compression( cudf_io_types.compression_type compression ) except + + void set_row_group_size_bytes(size_t val) except+ + void set_row_group_size_rows(size_type val) except+ @staticmethod chunked_parquet_writer_options_builder builder( @@ -160,6 +174,12 @@ cdef extern from 
"cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& compression( cudf_io_types.compression_type compression ) except + + parquet_writer_options_builder& row_group_size_bytes( + size_t val + ) except+ + parquet_writer_options_builder& row_group_size_rows( + size_type val + ) except+ chunked_parquet_writer_options build() except + @@ -173,6 +193,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: string column_chunks_file_path, ) except+ - cdef unique_ptr[vector[uint8_t]] merge_rowgroup_metadata( + cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata( const vector[unique_ptr[vector[uint8_t]]]& metadata_list ) except + diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 71705f4d0c1..d17184685fa 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -46,7 +46,7 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.io.parquet cimport ( chunked_parquet_writer_options, chunked_parquet_writer_options_builder, - merge_rowgroup_metadata as parquet_merge_metadata, + merge_row_group_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, parquet_reader_options, parquet_writer_options, @@ -282,7 +282,9 @@ cpdef write_parquet( object compression="snappy", object statistics="ROWGROUP", object metadata_file_path=None, - object int96_timestamps=False): + object int96_timestamps=False, + object row_group_size_bytes=None, + object row_group_size_rows=None): """ Cython function to call into libcudf API, see `write_parquet`. @@ -334,7 +336,6 @@ cpdef write_parquet( cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) - cdef parquet_writer_options args cdef unique_ptr[vector[uint8_t]] out_metadata_c cdef string c_column_chunks_file_path cdef bool _int96_timestamps = int96_timestamps @@ -342,16 +343,21 @@ cpdef write_parquet( c_column_chunks_file_path = str.encode(metadata_file_path) # Perform write + cdef parquet_writer_options args = move( + parquet_writer_options.builder(sink, tv) + .metadata(tbl_meta.get()) + .compression(comp_type) + .stats_level(stat_freq) + .column_chunks_file_path(c_column_chunks_file_path) + .int96_timestamps(_int96_timestamps) + .build() + ) + if row_group_size_bytes is not None: + args.set_row_group_size_bytes(row_group_size_bytes) + if row_group_size_rows is not None: + args.set_row_group_size_rows(row_group_size_rows) + with nogil: - args = move( - parquet_writer_options.builder(sink, tv) - .metadata(tbl_meta.get()) - .compression(comp_type) - .stats_level(stat_freq) - .column_chunks_file_path(c_column_chunks_file_path) - .int96_timestamps(_int96_timestamps) - .build() - ) out_metadata_c = move(parquet_writer(args)) if metadata_file_path is not None: @@ -483,11 +489,11 @@ cdef class ParquetWriter: cpdef merge_filemetadata(object filemetadata_list): """ - Cython function to call into libcudf API, see `merge_rowgroup_metadata`. + Cython function to call into libcudf API, see `merge_row_group_metadata`. 
See Also
     --------
-    cudf.io.parquet.merge_rowgroup_metadata
+    cudf.io.parquet.merge_row_group_metadata
     """
     cdef vector[unique_ptr[vector[uint8_t]]] list_c
     cdef vector[uint8_t] blob_c
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 302021a082f..9d665d9a0a5 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -441,6 +441,8 @@ def to_parquet(
     statistics="ROWGROUP",
     metadata_file_path=None,
     int96_timestamps=False,
+    row_group_size_bytes=None,
+    row_group_size_rows=None,
     *args,
     **kwargs,
 ):
@@ -480,6 +482,8 @@ def to_parquet(
             statistics=statistics,
             metadata_file_path=metadata_file_path,
             int96_timestamps=int96_timestamps,
+            row_group_size_bytes=row_group_size_bytes,
+            row_group_size_rows=row_group_size_rows,
         )
     else:
         write_parquet_res = libparquet.write_parquet(
@@ -490,6 +494,8 @@ def to_parquet(
             statistics=statistics,
             metadata_file_path=metadata_file_path,
             int96_timestamps=int96_timestamps,
+            row_group_size_bytes=row_group_size_bytes,
+            row_group_size_rows=row_group_size_rows,
         )
 
     return write_parquet_res
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index df31738050b..b6595be9566 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -231,6 +231,11 @@ def _make_parquet_path_or_buf(src):
     yield _make_parquet_path_or_buf
 
 
+@pytest.fixture(scope="module")
+def large_int64_gdf():
+    return cudf.DataFrame.from_pandas(pd.DataFrame({"col": range(0, 1 << 20)}))
+
+
 @pytest.mark.filterwarnings("ignore:Using CPU")
 @pytest.mark.parametrize("engine", ["pyarrow", "cudf"])
 @pytest.mark.parametrize(
@@ -2170,3 +2175,21 @@ def test_parquet_reader_brotli(datadir):
     got = cudf.read_parquet(fname).to_pandas(nullable=True)
 
     assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000])
+@pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000])
+def test_parquet_writer_row_group_size(
+    tmpdir, large_int64_gdf, size_bytes, size_rows
+):
+    fname = tmpdir.join("row_group_size.parquet")
+    large_int64_gdf.to_parquet(
+        fname, row_group_size_bytes=size_bytes, row_group_size_rows=size_rows
+    )
+
+    num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
+    # 8 bytes per row, as the column is int64
+    expected_num_row_groups = max(
+        math.ceil(num_rows / size_rows), math.ceil(8 * num_rows / size_bytes)
+    )
+    assert expected_num_row_groups == row_groups
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 6746753249c..11994830fed 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -221,6 +221,12 @@
     timestamp[us] to the int96 format, which is the number of Julian
     days and the number of nanoseconds since midnight. If ``False``,
     timestamps will not be altered.
+row_group_size_bytes: integer or None, default None
+    Maximum size of each row group of the output.
+    If None, 134217728 (128MB) will be used.
+row_group_size_rows: integer or None, default None
+    Maximum number of rows of each row group of the output.
+    If None, 1000000 will be used.
 
 
 See Also
@@ -404,10 +410,10 @@
 stripe_size_bytes: integer or None, default None
     Maximum size of each stripe of the output.
     If None, 67108864 (64MB) will be used.
-stripe_size_rows: integer or None, default None 1000000
+stripe_size_rows: integer or None, default None
     Maximum number of rows of each stripe of the output.
     If None, 1000000 will be used.
-row_index_stride: integer or None, default None 10000 +row_index_stride: integer or None, default None Row index stride (maximum number of rows in each row group). If None, 10000 will be used. From 17e6f5b9d0a9456e82250f725da5fe61ce6c9ff5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 17 Nov 2021 14:58:38 -0800 Subject: [PATCH 009/202] Simplify merge internals and reduce overhead (#9516) This PR is a pretty thorough rewrite of the internals of merging. There is a ton of complexity imposed by matching all the different edge cases allowed by the pandas API, but I've tried to unify the logic for different code paths as much as possible. I've also added checks for a number of edge cases that were not previously being handled. I see about a 10% performance improvement for merges on small to medium data sizes from this PR (as expected, there's no change for large data where most time is spent in C++). There's also a substantial reduction in total code that should make it easier to address issues going forward. I'm still not entirely happy with the complexity of the result and I think that further simplification should be possible, but I think this is a sufficiently large step forward to be worth pushing forward in this state, especially if it helps enable other changes to joining. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9516 --- python/cudf/cudf/core/dataframe.py | 30 +- python/cudf/cudf/core/frame.py | 9 +- python/cudf/cudf/core/groupby/groupby.py | 24 - python/cudf/cudf/core/join/__init__.py | 4 +- python/cudf/cudf/core/join/_join_helpers.py | 118 ++-- python/cudf/cudf/core/join/join.py | 505 ++++++++---------- python/cudf/cudf/tests/test_joining.py | 34 +- python/dask_cudf/dask_cudf/tests/test_join.py | 4 - 8 files changed, 288 insertions(+), 440 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b2e6588edb2..a95453a4e62 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -598,9 +598,12 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): else: if is_list_like(data): if len(data) > 0 and is_scalar(data[0]): - new_df = self._from_columns( - [data], index=index, columns=columns - ) + if columns is not None: + data = dict(zip(columns, [data])) + else: + data = dict(enumerate([data])) + new_df = DataFrame(data=data, index=index) + self._data = new_df._data self.index = new_df._index self.columns = new_df.columns @@ -3760,11 +3763,8 @@ def join( FutureWarning, ) - lhs = self - rhs = other - - df = lhs.merge( - rhs, + df = self.merge( + other, left_index=True, right_index=True, how=how, @@ -3772,7 +3772,7 @@ def join( sort=sort, ) df.index.name = ( - None if lhs.index.name != rhs.index.name else lhs.index.name + None if self.index.name != other.index.name else self.index.name ) return df @@ -5093,18 +5093,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._index = as_index(index) return df - @classmethod - def _from_columns(cls, cols, index=None, columns=None): - """ - Construct a DataFrame from a list of Columns - """ - if columns is not None: - data = dict(zip(columns, cols)) - else: - data = dict(enumerate(cols)) - - return cls(data=data, index=index,) - def interpolate( self, method="linear", diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c0858398492..72239fc2a8e 100644 --- 
a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -46,7 +46,7 @@ serialize_columns, ) from cudf.core.column_accessor import ColumnAccessor -from cudf.core.join import merge +from cudf.core.join import Merge, MergeSemi from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame from cudf.core.window import Rolling from cudf.utils import ioutils @@ -3755,6 +3755,7 @@ def _merge( suffixes=("_x", "_y"), ): lhs, rhs = self, right + merge_cls = Merge if how == "right": # Merge doesn't support right, so just swap how = "left" @@ -3762,8 +3763,10 @@ def _merge( left_on, right_on = right_on, left_on left_index, right_index = right_index, left_index suffixes = (suffixes[1], suffixes[0]) + elif how in {"leftsemi", "leftanti"}: + merge_cls = MergeSemi - return merge( + return merge_cls( lhs, rhs, on=on, @@ -3775,7 +3778,7 @@ def _merge( sort=sort, indicator=indicator, suffixes=suffixes, - ) + ).perform_merge() def _is_sorted(self, ascending=None, null_position=None): """ diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ba69e42674a..dc6461663ce 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1178,18 +1178,6 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): _PROTECTED_KEYS = frozenset(("obj",)) - def __init__( - self, obj, by=None, level=None, sort=False, as_index=True, dropna=True - ): - super().__init__( - obj=obj, - by=by, - level=level, - sort=sort, - as_index=as_index, - dropna=dropna, - ) - def __getitem__(self, key): return self.obj[key].groupby( self.grouping, dropna=self._dropna, sort=self._sort @@ -1262,18 +1250,6 @@ class SeriesGroupBy(GroupBy): Name: Max Speed, dtype: float64 """ - def __init__( - self, obj, by=None, level=None, sort=False, as_index=True, dropna=True - ): - super().__init__( - obj=obj, - by=by, - level=level, - sort=sort, - as_index=as_index, - dropna=dropna, - ) - def agg(self, func): result = super().agg(func) diff --git a/python/cudf/cudf/core/join/__init__.py b/python/cudf/cudf/core/join/__init__.py index 0463b8f9df1..71a91c398ad 100644 --- a/python/cudf/cudf/core/join/__init__.py +++ b/python/cudf/cudf/core/join/__init__.py @@ -1,3 +1,3 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
-from cudf.core.join.join import merge +from cudf.core.join.join import Merge, MergeSemi diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index cc9c0fb66da..6dec0b10273 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -3,16 +3,17 @@ import collections import warnings -from typing import TYPE_CHECKING, Any, Iterable, Tuple +from typing import TYPE_CHECKING, Any, Tuple, cast import numpy as np -import pandas as pd import cudf +from cudf.api.types import is_dtype_equal +from cudf.core.column import CategoricalColumn from cudf.core.dtypes import CategoricalDtype if TYPE_CHECKING: - from cudf.core.column import CategoricalColumn, ColumnBase + from cudf.core.column import ColumnBase from cudf.core.frame import Frame @@ -28,61 +29,36 @@ class _Indexer: # >>> _Indexer("a", column=True).get(df) # returns column "a" of df # >>> _Indexer("b", index=True).get(df) # returns index level "b" of df - def __init__(self, name: Any, column=False, index=False): - if column and index: - raise ValueError("Cannot specify both column and index") + def __init__(self, name: Any): self.name = name - self.column, self.index = column, index + +class _ColumnIndexer(_Indexer): def get(self, obj: Frame) -> ColumnBase: - # get the column from `obj` - if self.column: - return obj._data[self.name] - else: - if obj._index is not None: - return obj._index._data[self.name] - raise KeyError() + return obj._data[self.name] def set(self, obj: Frame, value: ColumnBase, validate=False): - # set the colum in `obj` - if self.column: - obj._data.set_by_label(self.name, value, validate=validate) - else: - if obj._index is not None: - obj._index._data.set_by_label( - self.name, value, validate=validate - ) - else: - raise KeyError() - - -def _frame_select_by_indexers( - frame: Frame, indexers: Iterable[_Indexer] -) -> Frame: - # Select columns from the given `Frame` using `indexers`, - # and return a new `Frame`. - index_data = frame._data.__class__() - data = frame._data.__class__() - - for idx in indexers: - if idx.index: - index_data.set_by_label(idx.name, idx.get(frame), validate=False) - else: - data.set_by_label(idx.name, idx.get(frame), validate=False) + obj._data.set_by_label(self.name, value, validate=validate) - result_index = ( - cudf.core.index._index_from_data(index_data) if index_data else None - ) - result = cudf.core.frame.Frame(data=data, index=result_index) - return result + +class _IndexIndexer(_Indexer): + def get(self, obj: Frame) -> ColumnBase: + if obj._index is not None: + return obj._index._data[self.name] + raise KeyError + + def set(self, obj: Frame, value: ColumnBase, validate=False): + if obj._index is not None: + obj._index._data.set_by_label(self.name, value, validate=validate) + else: + raise KeyError def _match_join_keys( lcol: ColumnBase, rcol: ColumnBase, how: str ) -> Tuple[ColumnBase, ColumnBase]: - # returns the common dtype that lcol and rcol should be casted to, - # before they can be used as left and right join keys. - # If no casting is necessary, returns None + # Casts lcol and rcol to a common dtype for use as join keys. If no casting + # is necessary, they are returned as is. 
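+    # (For example, an int32 key joined against an int64 key will have both
+    # sides promoted to int64 before the join.)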
common_type = None @@ -91,12 +67,22 @@ def _match_join_keys( rtype = rcol.dtype # if either side is categorical, different logic - if isinstance(ltype, CategoricalDtype) or isinstance( - rtype, CategoricalDtype - ): - return _match_categorical_dtypes(lcol, rcol, how) + left_is_categorical = isinstance(ltype, CategoricalDtype) + right_is_categorical = isinstance(rtype, CategoricalDtype) + if left_is_categorical and right_is_categorical: + return _match_categorical_dtypes_both( + cast(CategoricalColumn, lcol), cast(CategoricalColumn, rcol), how + ) + elif left_is_categorical or right_is_categorical: + if left_is_categorical: + if how in {"left", "leftsemi", "leftanti"}: + return lcol, rcol.astype(ltype) + common_type = ltype.categories.dtype + else: + common_type = rtype.categories.dtype + return lcol.astype(common_type), rcol.astype(common_type) - if pd.api.types.is_dtype_equal(ltype, rtype): + if is_dtype_equal(ltype, rtype): return lcol, rcol if isinstance(ltype, cudf.Decimal64Dtype) or isinstance( @@ -131,34 +117,9 @@ def _match_join_keys( return lcol.astype(common_type), rcol.astype(common_type) -def _match_categorical_dtypes( - lcol: ColumnBase, rcol: ColumnBase, how: str -) -> Tuple[ColumnBase, ColumnBase]: - # cast the keys lcol and rcol to a common dtype - # when at least one of them is a categorical type - ltype, rtype = lcol.dtype, rcol.dtype - - if isinstance(lcol, cudf.core.column.CategoricalColumn) and isinstance( - rcol, cudf.core.column.CategoricalColumn - ): - # if both are categoricals, logic is complicated: - return _match_categorical_dtypes_both(lcol, rcol, how) - - if isinstance(ltype, CategoricalDtype): - if how in {"left", "leftsemi", "leftanti"}: - return lcol, rcol.astype(ltype) - common_type = ltype.categories.dtype - elif isinstance(rtype, CategoricalDtype): - common_type = rtype.categories.dtype - return lcol.astype(common_type), rcol.astype(common_type) - - def _match_categorical_dtypes_both( lcol: CategoricalColumn, rcol: CategoricalColumn, how: str ) -> Tuple[ColumnBase, ColumnBase]: - # The commontype depends on both `how` and the specifics of the - # categorical variables to be merged. - ltype, rtype = lcol.dtype, rcol.dtype # when both are ordered and both have the same categories, @@ -184,9 +145,6 @@ def _match_categorical_dtypes_both( "neither side is ordered" ) - # the following should now always hold - assert not ltype.ordered and not rtype.ordered - if how == "inner": # cast to category types -- we must cast them back later return _match_join_keys( diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 28b2d5d8167..dd8f462fb1d 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,16 +1,14 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
from __future__ import annotations -import functools -from collections import namedtuple -from typing import TYPE_CHECKING, Callable, Tuple +from typing import TYPE_CHECKING, Callable import cudf from cudf import _lib as libcudf from cudf.core.join._join_helpers import ( _coerce_to_tuple, - _frame_select_by_indexers, - _Indexer, + _ColumnIndexer, + _IndexIndexer, _match_join_keys, ) @@ -18,47 +16,7 @@ from cudf.core.frame import Frame -def merge( - lhs, - rhs, - *, - on, - left_on, - right_on, - left_index, - right_index, - how, - sort, - indicator, - suffixes, -): - if how in {"leftsemi", "leftanti"}: - merge_cls = MergeSemi - else: - merge_cls = Merge - mergeobj = merge_cls( - lhs, - rhs, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - how=how, - sort=sort, - indicator=indicator, - suffixes=suffixes, - ) - return mergeobj.perform_merge() - - -_JoinKeys = namedtuple("JoinKeys", ["left", "right"]) - - -class Merge(object): - # A namedtuple of indexers representing the left and right keys - _keys: _JoinKeys - +class Merge: # The joiner function must have the following signature: # # def joiner( @@ -71,7 +29,7 @@ class Merge(object): # join key. The `joiner` returns a tuple of two Columns # representing the rows to gather from the left- and right- side # tables respectively. - _joiner: Callable + _joiner: Callable = libcudf.join.join def __init__( self, @@ -133,150 +91,157 @@ def __init__( how=how, suffixes=suffixes, ) - self._joiner = functools.partial(libcudf.join.join, how=how) - - self.lhs = lhs - self.rhs = rhs - self.on = on - self.left_on = left_on - self.right_on = right_on - self.left_index = left_index - self.right_index = right_index + + self.lhs = lhs.copy(deep=False) + self.rhs = rhs.copy(deep=False) self.how = how self.sort = sort - if suffixes: - self.lsuffix, self.rsuffix = suffixes - self._compute_join_keys() - - @property - def _out_class(self): - # type of the result - out_class = cudf.DataFrame + self.lsuffix, self.rsuffix = suffixes + + # At this point validation guarantees that if on is not None we + # don't have any other args, so we can apply it directly to left_on and + # right_on. 
+ self._using_left_index = bool(left_index) + left_on = ( + lhs.index._data.names if left_index else left_on if left_on else on + ) + self._using_right_index = bool(right_index) + right_on = ( + rhs.index._data.names + if right_index + else right_on + if right_on + else on + ) - if isinstance(self.lhs, cudf.MultiIndex) or isinstance( - self.rhs, cudf.MultiIndex + if left_on or right_on: + self._left_keys = [ + _ColumnIndexer(name=on) + if not self._using_left_index and on in lhs._data + else _IndexIndexer(name=on) + for on in (_coerce_to_tuple(left_on) if left_on else []) + ] + self._right_keys = [ + _ColumnIndexer(name=on) + if not self._using_right_index and on in rhs._data + else _IndexIndexer(name=on) + for on in (_coerce_to_tuple(right_on) if right_on else []) + ] + if len(self._left_keys) != len(self._right_keys): + raise ValueError( + "Merge operands must have same number of join key columns" + ) + self._using_left_index = any( + isinstance(idx, _IndexIndexer) for idx in self._left_keys + ) + self._using_right_index = any( + isinstance(idx, _IndexIndexer) for idx in self._right_keys + ) + else: + # if `on` is not provided and we're not merging + # index with column or on both indexes, then use + # the intersection of columns in both frames + on_names = set(lhs._data) & set(rhs._data) + self._left_keys = [_ColumnIndexer(name=on) for on in on_names] + self._right_keys = [_ColumnIndexer(name=on) for on in on_names] + self._using_left_index = False + self._using_right_index = False + + if isinstance(lhs, cudf.MultiIndex) or isinstance( + rhs, cudf.MultiIndex ): - out_class = cudf.MultiIndex - elif isinstance(self.lhs, cudf.BaseIndex): - out_class = self.lhs.__class__ - return out_class + self._out_class = cudf.MultiIndex + elif isinstance(lhs, cudf.BaseIndex): + self._out_class = lhs.__class__ + else: + self._out_class = cudf.DataFrame + + self._key_columns_with_same_name = ( + set(_coerce_to_tuple(on)) + if on + else set() + if (self._using_left_index or self._using_right_index) + else set( + [ + lkey.name + for lkey, rkey in zip(self._left_keys, self._right_keys) + if lkey.name == rkey.name + ] + ) + ) def perform_merge(self) -> Frame: - lhs, rhs = self._match_key_dtypes(self.lhs, self.rhs) - - left_table = _frame_select_by_indexers(lhs, self._keys.left) - right_table = _frame_select_by_indexers(rhs, self._keys.right) + left_join_cols = {} + right_join_cols = {} + + for left_key, right_key in zip(self._left_keys, self._right_keys): + lcol = left_key.get(self.lhs) + rcol = right_key.get(self.rhs) + lcol_casted, rcol_casted = _match_join_keys(lcol, rcol, self.how) + left_join_cols[left_key.name] = lcol_casted + right_join_cols[left_key.name] = rcol_casted + + # Categorical dtypes must be cast back from the underlying codes + # type that was returned by _match_join_keys. 
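+            # (This only applies to inner joins where both keys are
+            # categorical: _match_join_keys returns them as plain
+            # non-categorical columns, so they are re-wrapped as
+            # 'category' here.)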
+ if ( + self.how == "inner" + and isinstance(lcol.dtype, cudf.CategoricalDtype) + and isinstance(rcol.dtype, cudf.CategoricalDtype) + ): + lcol_casted = lcol_casted.astype("category") + rcol_casted = rcol_casted.astype("category") + + left_key.set(self.lhs, lcol_casted, validate=False) + right_key.set(self.rhs, rcol_casted, validate=False) left_rows, right_rows = self._joiner( - left_table, right_table, how=self.how, + cudf.core.frame.Frame(left_join_cols), + cudf.core.frame.Frame(right_join_cols), + how=self.how, ) - lhs, rhs = self._restore_categorical_keys(lhs, rhs) - left_result = cudf.core.frame.Frame() - right_result = cudf.core.frame.Frame() + gather_index = self._using_left_index or self._using_right_index - gather_index = self.left_index or self.right_index - if left_rows is not None: - left_result = lhs._gather( + left_result = ( + self.lhs._gather( left_rows, nullify=True, keep_index=gather_index, check_bounds=False, ) - if right_rows is not None: - right_result = rhs._gather( + if left_rows is not None + else cudf.core.frame.Frame() + ) + right_result = ( + self.rhs._gather( right_rows, nullify=True, keep_index=gather_index, check_bounds=False, ) + if right_rows is not None + else cudf.core.frame.Frame() + ) - result = self._merge_results(left_result, right_result) + result = self._out_class._from_data( + *self._merge_results(left_result, right_result) + ) if self.sort: result = self._sort_result(result) return result - def _compute_join_keys(self): - # Computes self._keys - left_keys = [] - right_keys = [] - if ( - self.left_index - or self.right_index - or self.left_on - or self.right_on - ): - if self.left_index: - left_keys.extend( - [ - _Indexer(name=on, index=True) - for on in self.lhs.index._data.names - ] - ) - if self.left_on: - # TODO: require left_on or left_index to be specified - left_keys.extend( - [ - _Indexer(name=on, column=True) - for on in _coerce_to_tuple(self.left_on) - ] - ) - if self.right_index: - right_keys.extend( - [ - _Indexer(name=on, index=True) - for on in self.rhs.index._data.names - ] - ) - if self.right_on: - # TODO: require right_on or right_index to be specified - right_keys.extend( - [ - _Indexer(name=on, column=True) - for on in _coerce_to_tuple(self.right_on) - ] - ) - elif self.on: - on_names = _coerce_to_tuple(self.on) - for on in on_names: - # If `on` is provided, Merge on columns if present, - # otherwise default to indexes. - if on in self.lhs._data: - left_keys.append(_Indexer(name=on, column=True)) - else: - left_keys.append(_Indexer(name=on, index=True)) - if on in self.rhs._data: - right_keys.append(_Indexer(name=on, column=True)) - else: - right_keys.append(_Indexer(name=on, index=True)) - - else: - # if `on` is not provided and we're not merging - # index with column or on both indexes, then use - # the intersection of columns in both frames - on_names = set(self.lhs._data) & set(self.rhs._data) - left_keys = [_Indexer(name=on, column=True) for on in on_names] - right_keys = [_Indexer(name=on, column=True) for on in on_names] - - if len(left_keys) != len(right_keys): - raise ValueError( - "Merge operands must have same number of join key columns" - ) - - self._keys = _JoinKeys(left=left_keys, right=right_keys) - - def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: + def _merge_results(self, left_result: Frame, right_result: Frame): # Merge the Frames `left_result` and `right_result` into a single # `Frame`, suffixing column names if necessary. 
# If two key columns have the same name, a single output column appears - # in the result. For all other join types, the key column from the rhs - # is simply dropped. For outer joins, the two key columns are combined - # by filling nulls in the left key column with corresponding values - # from the right key column: + # in the result. For all non-outer join types, the key column from the + # rhs is simply dropped. For outer joins, the two key columns are + # combined by filling nulls in the left key column with corresponding + # values from the right key column: if self.how == "outer": - for lkey, rkey in zip(*self._keys): + for lkey, rkey in zip(self._left_keys, self._right_keys): if lkey.name == rkey.name: # fill nulls in lhs from values in the rhs lkey.set( @@ -285,36 +250,26 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: validate=False, ) - # Compute the result column names: - # left_names and right_names will be a mappings of input column names - # to the corresponding names in the final result. - left_names = dict(zip(left_result._data, left_result._data)) - right_names = dict(zip(right_result._data, right_result._data)) - - # For any columns from left_result and right_result that have the same - # name: - # - if they are key columns, keep only the left column - # - if they are not key columns, use suffixes to differentiate them - # in the final result - common_names = set(left_names) & set(right_names) - - if self.on: - key_columns_with_same_name = self.on - else: - key_columns_with_same_name = [ - lkey.name - for lkey, rkey in zip(*self._keys) - if ( - (lkey.index, rkey.index) == (False, False) - and lkey.name == rkey.name - ) - ] - for name in common_names: - if name not in key_columns_with_same_name: - left_names[name] = f"{name}{self.lsuffix}" - right_names[name] = f"{name}{self.rsuffix}" + # All columns from the left table make it into the output. Non-key + # columns that share a name with a column in the right table are + # suffixed with the provided suffix. + common_names = set(left_result._data.names) & set( + right_result._data.names + ) + cols_to_suffix = common_names - self._key_columns_with_same_name + data = { + (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col + for name, col in left_result._data.items() + } + + # The right table follows the same rule as the left table except that + # key columns from the right table are removed. + for name, col in right_result._data.items(): + if name in common_names: + if name not in self._key_columns_with_same_name: + data[f"{name}{self.rsuffix}"] = col else: - del right_names[name] + data[name] = col # determine if the result has multiindex columns. 
The result # of a join has a MultiIndex as its columns if: @@ -333,69 +288,44 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: else: multiindex_columns = False - # Assemble the data columns of the result - data = left_result._data.__class__(multiindex=multiindex_columns) - - for lcol in left_names: - data.set_by_label( - left_names[lcol], left_result._data[lcol], validate=False - ) - for rcol in right_names: - data.set_by_label( - right_names[rcol], right_result._data[rcol], validate=False - ) - - # Index of the result: - if self.left_index and self.right_index: + if self._using_right_index: + # right_index and left_on index = left_result._index - elif self.left_index: + elif self._using_left_index: # left_index and right_on index = right_result._index - elif self.right_index: - # right_index and left_on - index = left_result._index else: index = None # Construct result from data and index: - result = self._out_class._from_data(data=data, index=index) - - return result + return ( + left_result._data.__class__( + data=data, multiindex=multiindex_columns + ), + index, + ) def _sort_result(self, result: Frame) -> Frame: # Pandas sorts on the key columns in the # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, # the key columns on the other side will be used to sort. - if self.on: - if isinstance(result, cudf.BaseIndex): - sort_order = result._get_sorted_inds() - else: - # need a list instead of a tuple here because - # _get_sorted_inds calls down to ColumnAccessor.get_by_label - # which handles lists and tuples differently - sort_order = result._get_sorted_inds( - list(_coerce_to_tuple(self.on)) - ) - return result._gather( - sort_order, keep_index=False, check_bounds=False - ) by = [] - if self.left_index and self.right_index: + if self._using_left_index and self._using_right_index: if result._index is not None: by.extend(result._index._data.columns) - if self.left_on: - by.extend( - [result._data[col] for col in _coerce_to_tuple(self.left_on)] - ) - if self.right_on: - by.extend( - [result._data[col] for col in _coerce_to_tuple(self.right_on)] - ) + if not self._using_left_index: + by.extend([result._data[col.name] for col in self._left_keys]) + if not self._using_right_index: + by.extend([result._data[col.name] for col in self._right_keys]) if by: - to_sort = cudf.DataFrame._from_columns(by) + to_sort = cudf.DataFrame._from_data(dict(enumerate(by))) sort_order = to_sort.argsort() - result = result._gather(sort_order, check_bounds=False) + result = result._gather( + sort_order, + keep_index=self._using_left_index or self._using_right_index, + check_bounds=False, + ) return result @staticmethod @@ -410,10 +340,9 @@ def _validate_merge_params( how, suffixes, ): - """ - Error for various invalid combinations of merge input parameters - """ - # must actually support the requested merge type + # Error for various invalid combinations of merge input parameters + + # We must actually support the requested merge type if how not in {"left", "inner", "outer", "leftanti", "leftsemi"}: raise NotImplementedError(f"{how} merge not supported yet") @@ -424,15 +353,55 @@ def _validate_merge_params( 'Can only pass argument "on" OR "left_on" ' 'and "right_on", not a combination of both.' ) + elif left_index or right_index: + # Passing 'on' with 'left_index' or 'right_index' is ambiguous + raise ValueError( + 'Can only pass argument "on" OR "left_index" ' + 'and "right_index", not a combination of both.' 
+ ) else: # the validity of 'on' being checked by _Indexer return + elif left_on and left_index: + raise ValueError( + 'Can only pass argument "left_on" OR "left_index" not both.' + ) + elif right_on and right_index: + raise ValueError( + 'Can only pass argument "right_on" OR "right_index" not both.' + ) + + # Can't merge on a column name that is present in both a frame and its + # indexes. + if on: + for key in on: + if (key in lhs._data and key in lhs.index._data) or ( + key in rhs._data and key in rhs.index._data + ): + raise ValueError( + f"{key} is both an index level and a " + "column label, which is ambiguous." + ) + if left_on: + for key in left_on: + if key in lhs._data and key in lhs.index._data: + raise ValueError( + f"{key} is both an index level and a " + "column label, which is ambiguous." + ) + if right_on: + for key in right_on: + if key in rhs._data and key in rhs.index._data: + raise ValueError( + f"{key} is both an index level and a " + "column label, which is ambiguous." + ) # Can't merge on unnamed Series if (isinstance(lhs, cudf.Series) and not lhs.name) or ( isinstance(rhs, cudf.Series) and not rhs.name ): - raise ValueError("Can not merge on unnamed Series") + raise ValueError("Cannot merge on unnamed Series") # If nothing specified, must have common cols to use implicitly same_named_columns = set(lhs._data) & set(rhs._data) @@ -459,59 +428,15 @@ def _validate_merge_params( "lsuffix and rsuffix are not defined" ) - def _match_key_dtypes(self, lhs: Frame, rhs: Frame) -> Tuple[Frame, Frame]: - # Match the dtypes of the key columns from lhs and rhs - out_lhs = lhs.copy(deep=False) - out_rhs = rhs.copy(deep=False) - for left_key, right_key in zip(*self._keys): - lcol, rcol = left_key.get(lhs), right_key.get(rhs) - lcol_casted, rcol_casted = _match_join_keys( - lcol, rcol, how=self.how - ) - if lcol is not lcol_casted: - left_key.set(out_lhs, lcol_casted, validate=False) - if rcol is not rcol_casted: - right_key.set(out_rhs, rcol_casted, validate=False) - return out_lhs, out_rhs - - def _restore_categorical_keys( - self, lhs: Frame, rhs: Frame - ) -> Tuple[Frame, Frame]: - # For inner joins, any categorical keys in `self.lhs` and `self.rhs` - # were casted to their category type to produce `lhs` and `rhs`. - # Here, we cast them back. 
- out_lhs = lhs.copy(deep=False) - out_rhs = rhs.copy(deep=False) - if self.how == "inner": - for left_key, right_key in zip(*self._keys): - if isinstance( - left_key.get(self.lhs).dtype, cudf.CategoricalDtype - ) and isinstance( - right_key.get(self.rhs).dtype, cudf.CategoricalDtype - ): - left_key.set( - out_lhs, - left_key.get(out_lhs).astype("category"), - validate=False, - ) - right_key.set( - out_rhs, - right_key.get(out_rhs).astype("category"), - validate=False, - ) - return out_lhs, out_rhs - class MergeSemi(Merge): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._joiner = functools.partial( - libcudf.join.semi_join, how=kwargs["how"] - ) + _joiner: Callable = libcudf.join.semi_join - def _merge_results(self, lhs: Frame, rhs: Frame) -> Frame: + def _merge_results(self, lhs: Frame, rhs: Frame): # semi-join result includes only lhs columns - if issubclass(self._out_class, cudf.Index): - return self._out_class._from_data(lhs._data) - else: - return self._out_class._from_data(lhs._data, index=lhs._index) + return ( + lhs._data, + lhs._index + if not issubclass(self._out_class, cudf.Index) + else None, + ) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index e9f55c9e51a..0518cc2c9b9 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -230,10 +230,7 @@ def test_dataframe_join_combine_cats(): expect.index = expect.index.astype("category") got = lhs.join(rhs, how="outer") - # TODO: Remove copying to host - # after https://github.com/rapidsai/cudf/issues/5676 - # is implemented - assert_eq(expect.index.sort_values(), got.index.to_pandas().sort_values()) + assert_eq(expect.index.sort_values(), got.index.sort_values()) @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) @@ -744,12 +741,6 @@ def test_merge_sort(ons, hows): [ {"left_on": ["a"], "left_index": False, "right_index": True}, {"right_on": ["b"], "left_index": True, "right_index": False}, - { - "left_on": ["a"], - "right_on": ["b"], - "left_index": True, - "right_index": True, - }, ], ) def test_merge_sort_on_indexes(kwargs): @@ -1791,12 +1782,6 @@ def test_typecast_on_join_indexes_matching_categorical(): {"left_index": True, "right_on": "b"}, {"left_on": "a", "right_index": True}, {"left_index": True, "right_index": True}, - { - "left_on": "a", - "right_on": "b", - "left_index": True, - "right_index": True, - }, ], ) def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): @@ -2148,3 +2133,20 @@ def test_join_on_index_with_duplicate_names(): got = lhs.join(rhs, how="inner") assert_join_results_equal(expect, got, how="inner") + + +def test_join_redundant_params(): + lhs = cudf.DataFrame( + {"a": [1, 2, 3], "c": [2, 3, 4]}, index=cudf.Index([0, 1, 2], name="c") + ) + rhs = cudf.DataFrame( + {"b": [1, 2, 3]}, index=cudf.Index([0, 1, 2], name="a") + ) + with pytest.raises(ValueError): + lhs.merge(rhs, on="a", left_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, left_on="a", left_index=True, right_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, right_on="a", left_index=True, right_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, left_on="c", right_on="b") diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py index 58811ee98fc..8b2d85c59d7 100644 --- a/python/dask_cudf/dask_cudf/tests/test_join.py +++ b/python/dask_cudf/dask_cudf/tests/test_join.py @@ -245,8 +245,6 @@ def 
test_merge_should_fail():
         left.merge(right, how="left", on=["b"])
     with pytest.raises(KeyError):
         left.merge(right, how="left", on=["c"])
-    with pytest.raises(KeyError):
-        left.merge(right, how="left", on=["a"])
 
     # Same column names
     df2["b"] = np.random.randint(0, 12, 12)
 
@@ -254,8 +252,6 @@ def test_merge_should_fail():
 
     with pytest.raises(KeyError):
         left.merge(right, how="left", on="NonCol")
-    with pytest.raises(KeyError):
-        left.merge(right, how="left", on="a")
 
 
 @pytest.mark.parametrize("how", ["inner", "left"])

From 32bacfaa0a75fd3fb5fb44b106d8138f83001184 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?=
Date: Thu, 18 Nov 2021 00:24:07 +0100
Subject: [PATCH 010/202] Interchange dataframe protocol (#9071)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR is a basic implementation of the [interchange dataframe protocol](https://github.com/data-apis/dataframe-api/blob/main/protocol/dataframe_protocol.py) for cudf. As is well known, there are many dataframe libraries out there, and one library's weakness is often handled by another. To work across these libraries today we rely on `pandas`, with methods like `from_pandas` and `to_pandas`. This is a poor design, as it forces each library to maintain an extra dependency on pandas and its peculiarities. The interchange protocol instead provides a high-level API that dataframe libraries implement in order to communicate with one another: we get rid of the tight coupling with pandas and depend only on the protocol API, leaving each library the freedom of its own implementation details.

To illustrate:

- `df_obj = cudf_dataframe.__dataframe__()`: `df_obj` can be consumed by any library implementing the protocol.
- `df = cudf.from_dataframe(any_supported_dataframe)`: here we create a `cudf` dataframe from any dataframe object supporting the protocol.

So far, it supports the following:

- Column dtypes: `uint8`, `int`, `float`, `bool` and `categorical`.
- Missing values are handled for all these dtypes.
- `string` support is on the way.

Additionally, we support dataframes from CPU devices, like `pandas`. This is not testable here, as pandas has not yet adopted the protocol, but we have tested it locally against a monkey-patched pandas implementation of the protocol.
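As a minimal usage sketch (a hypothetical round trip between two cudf
DataFrames, exercising only the two public entry points added here; the
frame `gdf` is made up for illustration):

    import cudf

    gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [1.5, 2.5, 3.5]})

    # Producer side: expose the protocol object. A consumer library can
    # inspect columns, dtypes, null information and raw buffers through it.
    df_obj = gdf.__dataframe__()
    assert df_obj.num_columns() == 2
    assert df_obj.column_names() == ["a", "b"]

    # Consumer side: build a cudf DataFrame back from any object that
    # implements `__dataframe__`.
    gdf2 = cudf.from_dataframe(gdf)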
Authors: - Ismaël Koné (https://github.com/iskode) - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9071 --- python/cudf/cudf/__init__.py | 2 +- python/cudf/cudf/core/dataframe.py | 13 +- python/cudf/cudf/core/df_protocol.py | 829 +++++++++++++++++++++ python/cudf/cudf/tests/test_df_protocol.py | 219 ++++++ 4 files changed, 1061 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/core/df_protocol.py create mode 100644 python/cudf/cudf/tests/test_df_protocol.py diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index bc35551b5bd..f696a00d1ed 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -42,7 +42,7 @@ UInt64Index, interval_range, ) -from cudf.core.dataframe import DataFrame, from_pandas, merge +from cudf.core.dataframe import DataFrame, from_pandas, merge, from_dataframe from cudf.core.series import Series from cudf.core.multiindex import MultiIndex from cudf.core.cut import cut diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a95453a4e62..bfbe8b06c17 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -40,7 +40,7 @@ is_string_dtype, is_struct_dtype, ) -from cudf.core import column, reshape +from cudf.core import column, df_protocol, reshape from cudf.core.abc import Serializable from cudf.core.column import ( as_column, @@ -6329,6 +6329,17 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ): + return df_protocol.__dataframe__( + self, nan_as_null=nan_as_null, allow_copy=allow_copy + ) + + +def from_dataframe(df, allow_copy=False): + return df_protocol.from_dataframe(df, allow_copy=allow_copy) + def make_binop_func(op, postprocess=None): # This function is used to wrap binary operations in Frame with an diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py new file mode 100644 index 00000000000..8f258ce27b2 --- /dev/null +++ b/python/cudf/cudf/core/df_protocol.py @@ -0,0 +1,829 @@ +import collections +import enum +from typing import ( + Any, + Dict, + Iterable, + Mapping, + Optional, + Sequence, + Tuple, + cast, +) + +import cupy as cp +import numpy as np +from numba.cuda import as_cuda_array + +import cudf +from cudf.core.buffer import Buffer +from cudf.core.column import as_column, build_categorical_column, build_column + +# Implementation of interchange protocol classes +# ---------------------------------------------- + + +class _DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +class _Device(enum.IntEnum): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +_SUPPORTED_KINDS = { + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.CATEGORICAL, + _DtypeKind.BOOL, + _DtypeKind.STRING, +} +ProtoDtype = Tuple[_DtypeKind, int, str, str] + + +class _CuDFBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__( + self, + buf: cudf.core.buffer.Buffer, + dtype: np.dtype, + allow_copy: bool = True, + ) -> None: + """ + Use cudf.core.buffer.Buffer object. 
+ """ + # Store the cudf buffer where the data resides as a private + # attribute, so we can use it to retrieve the public attributes + self._buf = buf + self._dtype = dtype + self._allow_copy = allow_copy + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._buf.nbytes + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._buf.ptr + + def __dlpack__(self): + """ + DLPack not implemented in NumPy yet, so leave it out here. + """ + try: + cudarray = as_cuda_array(self._buf).view(self._dtype) + res = cp.asarray(cudarray).toDlpack() + + except ValueError: + raise TypeError(f"dtype {self._dtype} unsupported by `dlpack`") + + return res + + def __dlpack_device__(self) -> Tuple[_Device, int]: + """ + _Device type and _Device ID for where the data in the buffer resides. + """ + return (_Device.CUDA, cp.asarray(self._buf).device.id) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(" + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "dlpack": self.__dlpack__(), + "device": self.__dlpack_device__()[0].name, + } + ) + +")" + + +class _CuDFColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + + """ + + def __init__( + self, + column: cudf.core.column.ColumnBase, + nan_as_null: bool = True, + allow_copy: bool = True, + ) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + if not isinstance(column, cudf.core.column.ColumnBase): + raise TypeError( + "column must be a subtype of df.core.column.ColumnBase," + f"got {type(column)}" + ) + self._col = column + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + @property + def size(self) -> int: + """ + Size of the column, in elements. + """ + return self._col.size + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + """ + return 0 + + @property + def dtype(self) -> ProtoDtype: + """ + Dtype description as a tuple + ``(kind, bit-width, format string, endianness)`` + + Kind : + + - INT = 0 + - UINT = 1 + - FLOAT = 2 + - BOOL = 20 + - STRING = 21 # UTF-8 + - DATETIME = 22 + - CATEGORICAL = 23 + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + + - Kind specifiers are aligned with DLPack where possible + (hence the jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 + (for bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case + in the future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, + and for categoricals. 
+ - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding + of the categorical (e.g. an integer to string mapping), + this can be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, + binary, decimal, and nested (list, struct, map, union) dtypes. + """ + dtype = self._col.dtype + + # For now, assume that, if the column dtype is 'O' (i.e., `object`), + # then we have an array of strings + if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == "O": + return (_DtypeKind.STRING, 8, "u", "=") + + return self._dtype_from_cudfdtype(dtype) + + def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) + # not handled datetime and timedelta both map to datetime + # (is timedelta handled?) + _np_kinds = { + "i": _DtypeKind.INT, + "u": _DtypeKind.UINT, + "f": _DtypeKind.FLOAT, + "b": _DtypeKind.BOOL, + "U": _DtypeKind.STRING, + "M": _DtypeKind.DATETIME, + "m": _DtypeKind.DATETIME, + } + kind = _np_kinds.get(dtype.kind, None) + if kind is None: + # Not a NumPy/CuPy dtype. Check if it's a categorical maybe + if isinstance(dtype, cudf.CategoricalDtype): + kind = _DtypeKind.CATEGORICAL + # Codes and categories' dtypes are different. + # We use codes' dtype as these are stored in the buffer. + codes = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes + dtype = codes.dtype + else: + raise ValueError( + f"Data type {dtype} not supported by exchange protocol" + ) + + if kind not in _SUPPORTED_KINDS: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + bitwidth = dtype.itemsize * 8 + format_str = dtype.str + endianness = dtype.byteorder if kind != _DtypeKind.CATEGORICAL else "=" + return (kind, bitwidth, format_str, endianness) + + @property + def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: + """ + If the dtype is categorical, there are two options: + + - There are only values in the data buffer. + - There is a separate dictionary-style encoding for categorical values. + + Raises TypeError if the dtype is not categorical + + Content of returned dict: + + - "is_ordered" : bool, whether the ordering of dictionary + indices is semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "mapping" : dict, Python-level only (e.g. ``{int: str}``). + None if not a dictionary-style categorical. + """ + if not self.dtype[0] == _DtypeKind.CATEGORICAL: + raise TypeError( + "`describe_categorical only works on " + "a column with categorical dtype!" + ) + categ_col = cast(cudf.core.column.CategoricalColumn, self._col) + ordered = bool(categ_col.dtype.ordered) + is_dictionary = True + # NOTE: this shows the children approach is better, transforming + # `categories` to a "mapping" dict is inefficient + categories = categ_col.categories + mapping = {ix: val for ix, val in enumerate(categories.values_host)} + return ordered, is_dictionary, mapping + + @property + def describe_null(self) -> Tuple[int, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Kind: + + - 0 : non-nullable + - 1 : NaN/NaT + - 2 : sentinel value + - 3 : bit mask + - 4 : byte mask + + Value : if kind is "sentinel value", the actual value. 
+ If kind is a bit mask or a byte mask, the value (0 or 1) + indicating a missing value. + None otherwise. + """ + kind = self.dtype[0] + if self.null_count == 0: + # there is no validity mask so it is non-nullable + return 0, None + + elif kind in _SUPPORTED_KINDS: + # bit mask is universally used in cudf for missing + return 3, 0 + + else: + raise NotImplementedError( + f"Data type {self.dtype} not yet supported" + ) + + @property + def null_count(self) -> int: + """ + Number of null elements. Should always be known. + """ + return self._col.null_count + + @property + def metadata(self) -> Dict[str, Any]: + """ + Store specific metadata of the column. + """ + return {} + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + return 1 + + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["_CuDFColumn"]: + """ + Return an iterable yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + return (self,) + + def get_buffers( + self, + ) -> Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]]: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + buffers = {} + try: + buffers["validity"] = self._get_validity_buffer() + except RuntimeError: + buffers["validity"] = None + + try: + buffers["offsets"] = self._get_offsets_buffer() + except RuntimeError: + buffers["offsets"] = None + + buffers["data"] = self._get_data_buffer() + + return buffers + + def _get_validity_buffer( + self, + ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + """ + Return the buffer containing the mask values + indicating missing data and the buffer's associated dtype. + + Raises RuntimeError if null representation is not a bit or byte mask. + """ + + null, invalid = self.describe_null + if null == 3: + if self.dtype[0] == _DtypeKind.CATEGORICAL: + valid_mask = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes._get_mask_as_column() + else: + valid_mask = self._col._get_mask_as_column() + + assert (valid_mask is not None) and ( + valid_mask.data is not None + ), "valid_mask(.data) should not be None when " + "_CuDFColumn.describe_null[0] = 3" + buffer = _CuDFBuffer( + valid_mask.data, cp.uint8, allow_copy=self._allow_copy + ) + dtype = (_DtypeKind.UINT, 8, "C", "=") + return buffer, dtype + + elif null == 1: + raise RuntimeError( + "This column uses NaN as null " + "so does not have a separate mask" + ) + elif null == 0: + raise RuntimeError( + "This column is non-nullable so does not have a mask" + ) + else: + raise NotImplementedError( + f"See {self.__class__.__name__}.describe_null method." 
+ ) + + def _get_offsets_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + """ + Return the buffer containing the offset values for + variable-size binary data (e.g., variable-length strings) + and the buffer's associated dtype. + + Raises RuntimeError if the data buffer does not have an associated + offsets buffer. + """ + if self.dtype[0] == _DtypeKind.STRING: + offsets = self._col.children[0] + assert (offsets is not None) and (offsets.data is not None), " " + "offsets(.data) should not be None for string column" + + buffer = _CuDFBuffer( + offsets.data, offsets.dtype, allow_copy=self._allow_copy + ) + dtype = self._dtype_from_cudfdtype(offsets.dtype) + else: + raise RuntimeError( + "This column has a fixed-length dtype " + "so does not have an offsets buffer" + ) + + return buffer, dtype + + def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: + """ + Return the buffer containing the data and + the buffer's associated dtype. + """ + if self.dtype[0] in ( + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.BOOL, + ): + col_data = self._col + dtype = self.dtype + + elif self.dtype[0] == _DtypeKind.CATEGORICAL: + col_data = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes + dtype = self._dtype_from_cudfdtype(col_data.dtype) + + elif self.dtype[0] == _DtypeKind.STRING: + col_data = self._col.children[1] + dtype = self._dtype_from_cudfdtype(col_data.dtype) + + else: + raise NotImplementedError( + f"Data type {self._col.dtype} not handled yet" + ) + assert (col_data is not None) and (col_data.data is not None), " " + f"col_data(.data) should not be None when dtype = {dtype}" + buffer = _CuDFBuffer( + col_data.data, col_data.dtype, allow_copy=self._allow_copy + ) + + return buffer, dtype + + +class _CuDFDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + Instances of this (private) class are returned from + ``cudf.DataFrame.__dataframe__`` as objects with the methods and + attributes defined on this class. + """ + + def __init__( + self, + df: "cudf.core.dataframe.DataFrame", + nan_as_null: bool = True, + allow_copy: bool = True, + ) -> None: + """ + Constructor - an instance of this (private) class is returned from + `cudf.DataFrame.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with + # ``NaN`` (or ``NaT``). + # This currently has no effect; once support for nullable extension + # dtypes is added, this value should be propagated to columns. + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + @property + def metadata(self): + # `index` isn't a regular column, and the protocol doesn't support row + # labels - so we export it as cuDF-specific metadata here. 
+ return {"cudf.index": self._df.index} + + def num_columns(self) -> int: + return len(self._df.columns) + + def num_rows(self) -> int: + return len(self._df) + + def num_chunks(self) -> int: + return 1 + + def column_names(self) -> Iterable[str]: + return self._df.columns.tolist() + + def get_column(self, i: int) -> _CuDFColumn: + return _CuDFColumn( + as_column(self._df.iloc[:, i]), allow_copy=self._allow_copy + ) + + def get_column_by_name(self, name: str) -> _CuDFColumn: + return _CuDFColumn( + as_column(self._df[name]), allow_copy=self._allow_copy + ) + + def get_columns(self) -> Iterable[_CuDFColumn]: + return [ + _CuDFColumn(as_column(self._df[name]), allow_copy=self._allow_copy) + for name in self._df.columns + ] + + def select_columns(self, indices: Sequence[int]) -> "_CuDFDataFrame": + if not isinstance(indices, collections.abc.Sequence): + raise ValueError("`indices` is not a sequence") + + return _CuDFDataFrame(self._df.iloc[:, indices]) + + def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": + if not isinstance(names, collections.Sequence): + raise ValueError("`names` is not a sequence") + + return _CuDFDataFrame( + self._df.loc[:, names], self._nan_as_null, self._allow_copy + ) + + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["_CuDFDataFrame"]: + """ + Return an iterator yielding the chunks. + """ + return (self,) + + +def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True +) -> _CuDFDataFrame: + """ + The public method to attach to cudf.DataFrame. + + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + This currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this protocol + specifies contiguous buffers. + """ + return _CuDFDataFrame(self, nan_as_null=nan_as_null, allow_copy=allow_copy) + + +""" +Implementation of the dataframe exchange protocol. + +Public API +---------- + +from_dataframe : construct a cudf.DataFrame from an input data frame which + implements the exchange protocol + +Notes +----- + +- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to + do in pure Python. It's more general but definitely less friendly than + having ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack + ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack), + this is worth looking at again. + +""" + + +# A typing protocol could be added later to let Mypy validate code using +# `from_dataframe` better. 
+DataFrameObject = Any +ColumnObject = Any + + +_INTS = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64} +_UINTS = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64} +_FLOATS = {32: cp.float32, 64: cp.float64} +_CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}} + + +def from_dataframe( + df: DataFrameObject, allow_copy: bool = False +) -> _CuDFDataFrame: + """ + Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` + """ + if isinstance(df, cudf.DataFrame): + return df + + if not hasattr(df, "__dataframe__"): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) + + +def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame: + """ + Create a cudf DataFrame object from DataFrameObject. + """ + # Check number of chunks, if there's more than one we need to iterate + if df.num_chunks() > 1: + raise NotImplementedError("More than one chunk not handled yet") + + # We need a dict of columns here, with each column being a cudf column. + columns = dict() + _buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + col = df.get_column_by_name(name) + + if col.dtype[0] in ( + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.BOOL, + ): + columns[name], _buf = _protocol_to_cudf_column_numeric(col) + + elif col.dtype[0] == _DtypeKind.CATEGORICAL: + columns[name], _buf = _protocol_to_cudf_column_categorical(col) + + elif col.dtype[0] == _DtypeKind.STRING: + columns[name], _buf = _protocol_to_cudf_column_string(col) + + else: + raise NotImplementedError( + f"Data type {col.dtype[0]} not handled yet" + ) + + _buffers.append(_buf) + + df_new = cudf.DataFrame._from_data(columns) + df_new._buffers = _buffers + return df_new + + +def _protocol_to_cudf_column_numeric( + col: _CuDFColumn, +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: + """ + Convert an int, uint, float or bool protocol column + to the corresponding cudf column + """ + if col.offset != 0: + raise NotImplementedError("column.offset > 0 not handled yet") + + buffers = col.get_buffers() + assert buffers["data"] is not None, "data buffer should not be None" + _dbuffer, _ddtype = buffers["data"] + _check_buffer_is_on_gpu(_dbuffer) + cudfcol_num = build_column( + Buffer(_dbuffer.ptr, _dbuffer.bufsize), + protocol_dtype_to_cupy_dtype(_ddtype), + ) + return _set_missing_values(col, cudfcol_num), buffers + + +def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None: + if ( + buffer.__dlpack_device__()[0] != _Device.CUDA + and not buffer._allow_copy + ): + raise TypeError( + "This operation must copy data from CPU to GPU. " + "Set `allow_copy=True` to allow it." + ) + + elif buffer.__dlpack_device__()[0] != _Device.CUDA and buffer._allow_copy: + raise NotImplementedError( + "Only cuDF/GPU dataframes are supported for now. " + "CPU (like `Pandas`) dataframes will be supported shortly." 
+ ) + + +def _set_missing_values( + protocol_col: _CuDFColumn, cudf_col: cudf.core.column.ColumnBase +) -> cudf.core.column.ColumnBase: + + valid_mask = protocol_col.get_buffers()["validity"] + if valid_mask is not None: + bitmask = cp.asarray( + Buffer(valid_mask[0].ptr, valid_mask[0].bufsize), cp.bool8 + ) + cudf_col[~bitmask] = None + + return cudf_col + + +def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype: + kind = _dtype[0] + bitwidth = _dtype[1] + if _dtype[0] not in _SUPPORTED_KINDS: + raise RuntimeError(f"Data type {_dtype[0]} not handled yet") + + return _CP_DTYPES[kind][bitwidth] + + +def _protocol_to_cudf_column_categorical( + col: _CuDFColumn, +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: + """ + Convert a categorical column to a Series instance + """ + ordered, is_dict, mapping = col.describe_categorical + if not is_dict: + raise NotImplementedError( + "Non-dictionary categoricals not supported yet" + ) + + categories = as_column(mapping.values()) + buffers = col.get_buffers() + assert buffers["data"] is not None, "data buffer should not be None" + codes_buffer, codes_dtype = buffers["data"] + _check_buffer_is_on_gpu(codes_buffer) + cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) + codes = build_column( + Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype + ) + + cudfcol = build_categorical_column( + categories=categories, + codes=codes, + mask=codes.base_mask, + size=codes.size, + ordered=ordered, + ) + + return _set_missing_values(col, cudfcol), buffers + + +def _protocol_to_cudf_column_string( + col: _CuDFColumn, +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: + """ + Convert a string ColumnObject to cudf Column object. 
+ """ + # Retrieve the data buffers + buffers = col.get_buffers() + + # Retrieve the data buffer containing the UTF-8 code units + assert buffers["data"] is not None, "data buffer should never be None" + data_buffer, data_dtype = buffers["data"] + _check_buffer_is_on_gpu(data_buffer) + encoded_string = build_column( + Buffer(data_buffer.ptr, data_buffer.bufsize), + protocol_dtype_to_cupy_dtype(data_dtype), + ) + + # Retrieve the offsets buffer containing the index offsets demarcating + # the beginning and end of each string + assert buffers["offsets"] is not None, "not possible for string column" + offset_buffer, offset_dtype = buffers["offsets"] + _check_buffer_is_on_gpu(offset_buffer) + offsets = build_column( + Buffer(offset_buffer.ptr, offset_buffer.bufsize), + protocol_dtype_to_cupy_dtype(offset_dtype), + ) + + cudfcol_str = build_column( + None, dtype=cp.dtype("O"), children=(offsets, encoded_string) + ) + return _set_missing_values(col, cudfcol_str), buffers diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py new file mode 100644 index 00000000000..d24c8ca2860 --- /dev/null +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -0,0 +1,219 @@ +from typing import Any, Tuple + +import cupy as cp +import pandas as pd +import pytest + +import cudf +from cudf.core.buffer import Buffer +from cudf.core.column import build_column +from cudf.core.df_protocol import ( + DataFrameObject, + _CuDFBuffer, + _CuDFColumn, + _DtypeKind, + _from_dataframe, + protocol_dtype_to_cupy_dtype, +) +from cudf.testing._utils import assert_eq + + +def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): + buf, dtype = buffer_and_dtype + device_id = cp.asarray(cudfcol.data).device.id + assert buf.__dlpack_device__() == (2, device_id) + col_from_buf = build_column( + Buffer(buf.ptr, buf.bufsize), protocol_dtype_to_cupy_dtype(dtype) + ) + # check that non null values are the equals as nulls are represented + # by sentinel values in the buffer. 
+    non_null_idxs = cudf.Series(cudfcol) != cudf.NA
+    assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])
+
+    if dtype[0] != _DtypeKind.BOOL:
+        array_from_dlpack = cp.fromDlpack(buf.__dlpack__())
+        col_array = cp.asarray(cudfcol.data_array_view)
+        assert_eq(array_from_dlpack.flatten(), col_array.flatten())
+    else:
+        pytest.raises(TypeError, buf.__dlpack__)
+
+
+def assert_column_equal(col: _CuDFColumn, cudfcol):
+    assert col.size == cudfcol.size
+    assert col.offset == 0
+    assert col.null_count == cudfcol.null_count
+    assert col.num_chunks() == 1
+    if col.null_count == 0:
+        pytest.raises(RuntimeError, col._get_validity_buffer)
+        assert col.get_buffers()["validity"] is None
+    else:
+        assert_buffer_equal(
+            col.get_buffers()["validity"],
+            cudfcol._get_mask_as_column().astype(cp.uint8),
+        )
+
+    if col.dtype[0] == _DtypeKind.CATEGORICAL:
+        assert_buffer_equal(col.get_buffers()["data"], cudfcol.codes)
+        assert col.get_buffers()["offsets"] is None
+
+    elif col.dtype[0] == _DtypeKind.STRING:
+        assert_buffer_equal(col.get_buffers()["data"], cudfcol.children[1])
+        assert_buffer_equal(col.get_buffers()["offsets"], cudfcol.children[0])
+
+    else:
+        assert_buffer_equal(col.get_buffers()["data"], cudfcol)
+        assert col.get_buffers()["offsets"] is None
+
+    if col.null_count == 0:
+        assert col.describe_null == (0, None)
+    else:
+        assert col.describe_null == (3, 0)
+
+
+def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame):
+    assert dfo.num_columns() == len(df.columns)
+    assert dfo.num_rows() == len(df)
+    assert dfo.num_chunks() == 1
+    assert dfo.column_names() == list(df.columns)
+    for col in df.columns:
+        assert_column_equal(dfo.get_column_by_name(col), df[col]._column)
+
+
+def assert_from_dataframe_equals(dfobj):
+    df2 = _from_dataframe(dfobj)
+
+    assert_dataframe_equal(dfobj, df2)
+    if isinstance(dfobj._df, cudf.DataFrame):
+        assert_eq(dfobj._df, df2)
+
+    elif isinstance(dfobj._df, pd.DataFrame):
+        assert_eq(cudf.DataFrame(dfobj._df), df2)
+
+    else:
+        raise TypeError(f"{type(dfobj._df)} not supported yet.")
+
+
+def assert_from_dataframe_exception(dfobj):
+    exception_msg = (
+        "This operation must copy data from CPU to GPU."
+        " Set `allow_copy=True` to allow it."
+    )
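+    # Note: pytest.raises(match=...) applies re.search to the exception
+    # message, so the text above is interpreted as a regular expression.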
+ with pytest.raises(TypeError, match=exception_msg): + _from_dataframe(dfobj) + + +def assert_df_unique_dtype_cols(data): + cdf = cudf.DataFrame(data=data) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + + +def test_from_dataframe(): + data = dict(a=[1, 2, 3], b=[9, 10, 11]) + df1 = cudf.DataFrame(data=data) + df2 = cudf.from_dataframe(df1) + assert_eq(df1, df2) + + +def test_int_dtype(): + data_int = dict(a=[1, 2, 3], b=[9, 10, 11]) + assert_df_unique_dtype_cols(data_int) + + +def test_float_dtype(): + data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8]) + assert_df_unique_dtype_cols(data_float) + + +def test_categorical_dtype(): + cdf = cudf.DataFrame({"A": [1, 2, 5, 1]}) + cdf["A"] = cdf["A"].astype("category") + col = cdf.__dataframe__().get_column_by_name("A") + assert col.dtype[0] == _DtypeKind.CATEGORICAL + assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + + +def test_bool_dtype(): + data_bool = dict(a=[True, True, False], b=[False, True, False]) + assert_df_unique_dtype_cols(data_bool) + + +def test_string_dtype(): + data_string = dict(a=["a", "b", "cdef", "", "g"]) + assert_df_unique_dtype_cols(data_string) + + +def test_mixed_dtype(): + data_mixed = dict( + int=[1, 2, 3], + float=[1.5, 2.5, 3.5], + bool=[True, False, True], + categorical=[5, 1, 5], + string=["rapidsai-cudf ", "", "df protocol"], + ) + assert_df_unique_dtype_cols(data_mixed) + + +def test_NA_int_dtype(): + data_int = dict( + a=[1, None, 3, None, 5], + b=[9, 10, None, 7, 8], + c=[6, 19, 20, 100, 1000], + ) + assert_df_unique_dtype_cols(data_int) + + +def test_NA_float_dtype(): + data_float = dict( + a=[1.4, None, 3.6, None, 5.2], + b=[9.7, 10.9, None, 7.8, 8.2], + c=[6.1, 19.2, 20.3, 100.4, 1000.5], + ) + assert_df_unique_dtype_cols(data_float) + + +def test_NA_categorical_dtype(): + df = cudf.DataFrame({"A": [1, 2, 5, 1]}) + df["B"] = df["A"].astype("category") + df.at[[1, 3], "B"] = None # Set two items to null + + # Some detailed testing for correctness of dtype and null handling: + col = df.__dataframe__().get_column_by_name("B") + assert col.dtype[0] == _DtypeKind.CATEGORICAL + assert col.null_count == 2 + assert col.describe_null == (3, 0) + assert col.num_chunks() == 1 + assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + + +def test_NA_bool_dtype(): + data_bool = dict(a=[None, True, False], b=[False, None, None]) + assert_df_unique_dtype_cols(data_bool) + + +def test_NA_string_dtype(): + df = cudf.DataFrame({"A": ["a", "b", "cdef", "", "g"]}) + df["B"] = df["A"].astype("object") + df.at[1, "B"] = cudf.NA # Set one item to null + + # Test for correctness and null handling: + col = df.__dataframe__().get_column_by_name("B") + assert col.dtype[0] == _DtypeKind.STRING + assert col.null_count == 1 + assert col.describe_null == (3, 0) + assert col.num_chunks() == 1 + assert_from_dataframe_equals(df.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + + +def test_NA_mixed_dtype(): + data_mixed = dict( + int=[1, None, 2, 3, 1000], + float=[None, 1.5, 2.5, 3.5, None], + bool=[True, None, False, None, None], + categorical=[5, 1, 5, 3, None], + string=[None, None, None, "df 
protocol", None], + ) + assert_df_unique_dtype_cols(data_mixed) From d4ff5185d10a988e26b9a32affed0ca5af821e78 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 18 Nov 2021 00:07:28 -0600 Subject: [PATCH 011/202] Simplify write_csv by removing unnecessary writer/impl classes (#9089) Depends on #9040 and (unfortunately) #9041 Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9089 --- cpp/include/cudf/io/detail/csv.hpp | 56 ++++--------- cpp/src/io/csv/durations.hpp | 39 +++++++++ cpp/src/io/csv/writer_impl.cu | 113 +++++++++++++------------- cpp/src/io/csv/writer_impl.hpp | 122 ----------------------------- cpp/src/io/functions.cpp | 10 ++- 5 files changed, 116 insertions(+), 224 deletions(-) create mode 100644 cpp/src/io/csv/durations.hpp delete mode 100644 cpp/src/io/csv/writer_impl.hpp diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index aac44bed50e..c190340f6c1 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,45 +40,23 @@ table_with_metadata read_csv(std::unique_ptr&& source, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -class writer { - public: - class impl; - - private: - std::unique_ptr _impl; - - public: - /** - * @brief Constructor for output to a file. - * - * @param sinkp The data sink to write the data to - * @param options Settings for controlling writing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - writer(std::unique_ptr sinkp, - csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); // cannot provide definition here (because - // _impl is incomplete hence unique_ptr has - // not enough sizeof() info) - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~writer(); +/** + * @brief Write an entire dataset to CSV format. + * + * @param sink Output sink + * @param table The set of columns + * @param metadata The metadata associated with the table + * @param options Settings for controlling behavior + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation + */ +void write_csv(data_sink* sink, + table_view const& table, + const table_metadata* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** - * @brief Writes the entire dataset. - * - * @param table Set of columns to output - * @param metadata Table metadata and column names - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ - void write(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); -}; } // namespace csv } // namespace detail } // namespace io diff --git a/cpp/src/io/csv/durations.hpp b/cpp/src/io/csv/durations.hpp new file mode 100644 index 00000000000..d42ddf3817c --- /dev/null +++ b/cpp/src/io/csv/durations.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +#include + +namespace cudf { +namespace io { +namespace detail { +namespace csv { + +std::unique_ptr pandas_format_durations( + column_view const& durations, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace csv +} // namespace detail +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index e8c673751db..b9b6fc6cf94 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -19,17 +19,25 @@ * @brief cuDF-IO CSV writer class implementation */ -#include "writer_impl.hpp" +#include "durations.hpp" + +#include "csv_common.h" +#include "csv_gpu.h" #include #include #include +#include +#include #include #include #include #include #include #include +#include +#include +#include #include #include @@ -40,13 +48,19 @@ #include #include +#include #include +#include +#include namespace cudf { namespace io { namespace detail { namespace csv { +using namespace cudf::io::csv; +using namespace cudf::io; + namespace { /** @@ -260,32 +274,16 @@ struct column_to_strings_fn { }; } // unnamed namespace -// Forward to implementation -writer::writer(std::unique_ptr sink, - csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sink), options, mr)) -{ -} - -// Destructor within this translation unit -writer::~writer() = default; - -writer::impl::impl(std::unique_ptr sink, - csv_writer_options const& options, - rmm::mr::device_memory_resource* mr) - : out_sink_(std::move(sink)), mr_(mr), options_(options) -{ -} - // write the header: column names: // -void writer::impl::write_chunked_begin(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_chunked_begin(data_sink* out_sink, + table_view const& table, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - if (options_.is_enabled_include_header()) { + if (options.is_enabled_include_header()) { // need to generate column names if metadata is not provided std::vector generated_col_names; if (metadata == nullptr) { @@ -298,8 +296,8 @@ void writer::impl::write_chunked_begin(table_view const& table, CUDF_EXPECTS(column_names.size() == static_cast(table.num_columns()), "Mismatch between number of column headers and table 
columns."); - auto const delimiter = options_.get_inter_column_delimiter(); - auto const terminator = options_.get_line_terminator(); + auto const delimiter = options.get_inter_column_delimiter(); + auto const terminator = options.get_line_terminator(); // process header names: // - if the header name includes the delimiter or terminator character, @@ -341,18 +339,21 @@ void writer::impl::write_chunked_begin(table_view const& table, } header.append(terminator); - out_sink_->host_write(header.data(), header.size()); + out_sink->host_write(header.data(), header.size()); } } -void writer::impl::write_chunked(strings_column_view const& str_column_view, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_chunked(data_sink* out_sink, + strings_column_view const& str_column_view, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // algorithm outline: // // for_each(strings_column.begin(), strings_column.end(), - // [sink = out_sink_](auto str_row) mutable { + // [sink = out_sink](auto str_row) mutable { // auto host_buffer = str_row.host_buffer(); // sink->host_write(host_buffer_.data(), host_buffer_.size()); // });//or...sink->device_write(device_buffer,...); @@ -362,7 +363,7 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); - cudf::string_scalar newline{options_.get_line_terminator()}; + cudf::string_scalar newline{options.get_line_terminator()}; auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, newline, string_scalar("", false), stream); strings_column_view strings_column{p_str_col_w_nl->view()}; @@ -370,9 +371,9 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, auto total_num_bytes = strings_column.chars_size(); char const* ptr_all_bytes = strings_column.chars_begin(); - if (out_sink_->is_device_write_preferred(total_num_bytes)) { + if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory - out_sink_->device_write(ptr_all_bytes, total_num_bytes, stream); + out_sink->device_write(ptr_all_bytes, total_num_bytes, stream); } else { // copy the bytes to host to write them out thrust::host_vector h_bytes(total_num_bytes); @@ -383,30 +384,33 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, stream.value())); stream.synchronize(); - out_sink_->host_write(h_bytes.data(), total_num_bytes); + out_sink->host_write(h_bytes.data(), total_num_bytes); } // Needs newline at the end, to separate from next chunk - if (out_sink_->is_device_write_preferred(newline.size())) { - out_sink_->device_write(newline.data(), newline.size(), stream); + if (out_sink->is_device_write_preferred(newline.size())) { + out_sink->device_write(newline.data(), newline.size(), stream); } else { - out_sink_->host_write(options_.get_line_terminator().data(), - options_.get_line_terminator().size()); + out_sink->host_write(options.get_line_terminator().data(), + options.get_line_terminator().size()); } } -void writer::impl::write(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_csv(data_sink* out_sink, + table_view const& table, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // write header: column names separated by delimiter: // (even for 
tables with no rows) // - write_chunked_begin(table, metadata, stream); + write_chunked_begin(out_sink, table, metadata, options, stream, mr); if (table.num_rows() > 0) { // no need to check same-size columns constraint; auto-enforced by table_view - auto n_rows_per_chunk = options_.get_rows_per_chunk(); + auto n_rows_per_chunk = options.get_rows_per_chunk(); // // This outputs the CSV in row chunks to save memory. // Maybe we can use the total_rows*count calculation and a memory threshold @@ -436,7 +440,7 @@ void writer::impl::write(table_view const& table, // convert each chunk to CSV: // - column_to_strings_fn converter{options_, stream, rmm::mr::get_current_device_resource()}; + column_to_strings_fn converter{options, stream, rmm::mr::get_current_device_resource()}; for (auto&& sub_view : vector_views) { // Skip if the table has no rows if (sub_view.num_rows() == 0) continue; @@ -459,32 +463,21 @@ void writer::impl::write(table_view const& table, // concatenate columns in each row into one big string column // (using null representation and delimiter): // - std::string delimiter_str{options_.get_inter_column_delimiter()}; + std::string delimiter_str{options.get_inter_column_delimiter()}; auto str_concat_col = [&] { if (str_table_view.num_columns() > 1) return cudf::strings::detail::concatenate(str_table_view, delimiter_str, - options_.get_na_rep(), + options.get_na_rep(), strings::separator_on_nulls::YES, stream); - cudf::string_scalar narep{options_.get_na_rep()}; + cudf::string_scalar narep{options.get_na_rep()}; return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); }(); - write_chunked(str_concat_col->view(), metadata, stream); + write_chunked(out_sink, str_concat_col->view(), metadata, options, stream, mr); } } - - // finalize (no-op, for now, but offers a hook for future extensions): - // - write_chunked_end(table, metadata, stream); -} - -void writer::write(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) -{ - _impl->write(table, metadata, stream); } } // namespace csv diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/writer_impl.hpp deleted file mode 100644 index 965c036dc75..00000000000 --- a/cpp/src/io/csv/writer_impl.hpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "csv_common.h" -#include "csv_gpu.h" - -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace csv { - -using namespace cudf::io::csv; -using namespace cudf::io; - -/** - * @brief Implementation for CSV writer - */ -class writer::impl { - public: - /** - * @brief Constructor with writer options. 
- * - * @param sink Output sink - * @param options Settings for controlling behavior - * @param mr Device memory resource to use for device memory allocation - */ - impl(std::unique_ptr sink, - csv_writer_options const& options, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Write an entire dataset to CSV format. - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write the header of a CSV format. - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked_begin(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write dataset to CSV format without header. - * - * @param strings_column Subset of columns converted to string to be written. - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked(strings_column_view const& strings_column, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write footer of CSV format (typically, empty). - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked_end(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) - { - // purposely no-op (for now); - } - - private: - std::unique_ptr out_sink_; - rmm::mr::device_memory_resource* mr_ = nullptr; - csv_writer_options const options_; -}; - -std::unique_ptr pandas_format_durations( - column_view const& durations, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace csv -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index a8ca1d3a459..402e212f07b 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -219,10 +219,14 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc using namespace cudf::io::detail; auto sink = make_datasink(options.get_sink()); - auto writer = - std::make_unique(std::move(sink), options, rmm::cuda_stream_default, mr); - writer->write(options.get_table(), options.get_metadata()); + return csv::write_csv( // + sink.get(), + options.get_table(), + options.get_metadata(), + options, + rmm::cuda_stream_default, + mr); } namespace detail_orc = cudf::io::detail::orc; From 406429a66fad55414fce22f2723270df411e1b75 Mon Sep 17 00:00:00 2001 From: Mayank Anand <36782063+mayankanand007@users.noreply.github.com> Date: Thu, 18 Nov 2021 10:07:58 -0500 Subject: [PATCH 012/202] ceil/floor for `DatetimeIndex` (#9554) Follow-up to #9571 where we add `ceil` and `floor` support for `Series`. Here we add `ceil` and `floor` support to `DatetimeIndex` class. 
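
For illustration, usage is expected to look like the following (a sketch
lifted from the docstring examples added in this diff):

    >>> import cudf
    >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00",
    ...                              "1999-12-31 18:40:00"])
    >>> gIndex.ceil("T")
    DatetimeIndex(['2020-05-31 08:00:00', '1999-12-31 18:40:00'],
                  dtype='datetime64[ns]', freq=None)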
This PR is dependent on #9571 getting merged first, since it assumes the `libcudf` implementation for `floor` exists.

Authors:
   - Mayank Anand (https://github.com/mayankanand007)

Approvers:
   - Michael Wang (https://github.com/isVoid)
   - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/9554
---
 docs/cudf/source/api_docs/index_objects.rst |  2 +
 python/cudf/cudf/core/frame.py              | 14 +++++
 python/cudf/cudf/core/index.py              | 62 +++++++++++++++++++++
 python/cudf/cudf/tests/test_index.py        | 26 +++++++++
 4 files changed, 104 insertions(+)

diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst
index 30269bb2a72..2a4dd5ff9c8 100644
--- a/docs/cudf/source/api_docs/index_objects.rst
+++ b/docs/cudf/source/api_docs/index_objects.rst
@@ -280,6 +280,8 @@ Time-specific operations
    :toctree: api/

    DatetimeIndex.round
+   DatetimeIndex.ceil
+   DatetimeIndex.floor

 Conversion
 ~~~~~~~~~~
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 72239fc2a8e..58fe8a43d8d 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -3673,6 +3673,13 @@ def ceil(self):
         3    5.0
         dtype: float64
         """
+
+        warnings.warn(
+            "Series.ceil and DataFrame.ceil are deprecated and will be "
+            "removed in the future",
+            DeprecationWarning,
+        )
+
         return self._unaryop("ceil")

     def floor(self):
@@ -3705,6 +3712,13 @@ def floor(self):
         5    3.0
         dtype: float64
         """
+
+        warnings.warn(
+            "Series.floor and DataFrame.floor are deprecated and will be "
+            "removed in the future",
+            DeprecationWarning,
+        )
+
         return self._unaryop("floor")

     def scale(self):
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 35b80715cca..63fda21152d 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1898,6 +1898,68 @@ def _get_dt_field(self, field):
     def is_boolean(self):
         return False

+    def ceil(self, field):
+        """
+        Perform ceil operation on the data to the specified freq.
+
+        Parameters
+        ----------
+        field : str
+            One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"].
+            Must be a fixed frequency like 'S' (second) not 'ME' (month end).
+            See `frequency aliases `__
+            for more details on these aliases.
+
+        Returns
+        -------
+        DatetimeIndex
+            Index of the same type for a DatetimeIndex
+
+        Examples
+        --------
+        >>> import cudf
+        >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00",
+        ...                              "1999-12-31 18:40:00"])
+        >>> gIndex.ceil("T")
+        DatetimeIndex(['2020-05-31 08:00:00', '1999-12-31 18:40:00'],
+                      dtype='datetime64[ns]', freq=None)
+        """
+        out_column = self._values.ceil(field)
+
+        return self.__class__._from_data({self.name: out_column})
+
+    def floor(self, field):
+        """
+        Perform floor operation on the data to the specified freq.
+
+        Parameters
+        ----------
+        field : str
+            One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"].
+            Must be a fixed frequency like 'S' (second) not 'ME' (month end).
+            See `frequency aliases `__
+            for more details on these aliases.
+
+        Returns
+        -------
+        DatetimeIndex
+            Index of the same type for a DatetimeIndex
+
+        Examples
+        --------
+        >>> import cudf
+        >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:59:59"
+        ...
,"1999-12-31 18:44:59"]) + >>> gIndex.floor("T") + DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], + dtype='datetime64[ns]', freq=None) + """ + out_column = self._values.floor(field) + + return self.__class__._from_data({self.name: out_column}) + class TimedeltaIndex(GenericIndex): """ diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index c6cf7c4e6f5..ab211616a02 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2470,3 +2470,29 @@ def test_index_type_methods(data, func): assert_eq(False, actual) else: assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) +def test_index_datetime_ceil(resolution): + cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) + pidx = cuidx.to_pandas() + + pidx_ceil = pidx.ceil(resolution) + cuidx_ceil = cuidx.ceil(resolution) + + assert_eq(pidx_ceil, cuidx_ceil) + + +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) +def test_index_datetime_floor(resolution): + cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) + pidx = cuidx.to_pandas() + + pidx_floor = pidx.floor(resolution) + cuidx_floor = cuidx.floor(resolution) + + assert_eq(pidx_floor, cuidx_floor) From 91fd74e0e2b9ada200f3c707cc4d0ca4efee329a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 18 Nov 2021 09:42:48 -0700 Subject: [PATCH 013/202] Support `min` and `max` reduction for structs (#9697) This PR continues to address https://github.com/rapidsai/cudf/issues/8974, adding support for structs in `min` and `max` reduction. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Mark Harris (https://github.com/harrism) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/9697 --- cpp/src/groupby/sort/group_scan_util.cuh | 20 +-- .../sort/group_single_pass_reduction_util.cuh | 20 +-- .../arg_minmax_util.cuh} | 4 +- cpp/src/reductions/simple.cuh | 61 +++++++- cpp/tests/reductions/reduction_tests.cpp | 131 +++++++++++++++++- 5 files changed, 210 insertions(+), 26 deletions(-) rename cpp/src/{groupby/sort/group_util.cuh => reductions/arg_minmax_util.cuh} (98%) diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index 013ea924cce..b565e8dc6d8 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -221,16 +221,18 @@ struct group_scan_functor(0); if (values.has_nulls()) { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::MIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::MIN); do_scan(count_iter, map_begin, binop); } else { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::MIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::MIN); do_scan(count_iter, map_begin, binop); } diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 4e0820af236..decb127b264 100644 --- 
a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -271,10 +271,11 @@ struct group_reduction_functor< auto const count_iter = thrust::make_counting_iterator(0); auto const result_begin = result->mutable_view().template begin(); if (values.has_nulls()) { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::ARGMIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::ARGMIN); do_reduction(count_iter, result_begin, binop); // Generate bitmask for the output by segmented reduction of the input bitmask. @@ -288,10 +289,11 @@ struct group_reduction_functor< validity.begin(), validity.end(), thrust::identity{}, stream, mr); result->set_null_mask(std::move(null_mask), null_count); } else { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::ARGMIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::ARGMIN); do_reduction(count_iter, result_begin, binop); } diff --git a/cpp/src/groupby/sort/group_util.cuh b/cpp/src/reductions/arg_minmax_util.cuh similarity index 98% rename from cpp/src/groupby/sort/group_util.cuh rename to cpp/src/reductions/arg_minmax_util.cuh index 31ff29ed4c3..40df23bcd8e 100644 --- a/cpp/src/groupby/sort/group_util.cuh +++ b/cpp/src/reductions/arg_minmax_util.cuh @@ -19,7 +19,7 @@ #include namespace cudf { -namespace groupby { +namespace reduction { namespace detail { /** @@ -62,5 +62,5 @@ struct row_arg_minmax_fn { }; } // namespace detail -} // namespace groupby +} // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh index 13dfe5cb26c..7dd54e9250a 100644 --- a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -16,9 +16,13 @@ #pragma once +#include + #include #include +#include #include +#include #include #include #include @@ -28,6 +32,9 @@ #include #include +#include + +#include namespace cudf { namespace reduction { @@ -252,8 +259,7 @@ struct same_element_type_dispatcher { template static constexpr bool is_supported() { - return !(cudf::is_dictionary() || std::is_same_v || - std::is_same_v); + return !(cudf::is_dictionary() || std::is_same_v); } template () && - not cudf::is_fixed_point()>* = nullptr> + std::enable_if_t && + (std::is_same_v || + std::is_same_v)>* = nullptr> + std::unique_ptr operator()(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + if (input.is_empty()) { return cudf::make_empty_scalar_like(input, stream, mr); } + + auto constexpr is_min_op = std::is_same_v; + + // We will do reduction to find the ARGMIN/ARGMAX index, then return the element at that index. + // When finding ARGMIN, we need to consider nulls as larger than non-null elements, and the + // opposite for ARGMAX. + auto constexpr null_precedence = is_min_op ? 
cudf::null_order::AFTER : cudf::null_order::BEFORE; + auto const flattened_input = cudf::structs::detail::flatten_nested_columns( + table_view{{input}}, {}, std::vector{null_precedence}); + auto const d_flattened_input_ptr = table_device_view::create(flattened_input, stream); + auto const flattened_null_precedences = + is_min_op ? cudf::detail::make_device_uvector_async(flattened_input.null_orders(), stream) + : rmm::device_uvector(0, stream); + + // Perform reduction to find ARGMIN/ARGMAX. + auto const do_reduction = [&](auto const& binop) { + return thrust::reduce(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + size_type{0}, + binop); + }; + + auto const minmax_idx = [&] { + if (input.has_nulls()) { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + return do_reduction(binop); + } else { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + return do_reduction(binop); + } + }(); + + return cudf::detail::get_element(input, minmax_idx, stream, mr); + } + + template () && !cudf::is_fixed_point() && + !std::is_same_v>* = nullptr> std::unique_ptr operator()(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 376f5ce5dd2..2c9279260e7 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -2055,7 +2056,7 @@ TEST_F(ListReductionTest, NonValidListReductionNthElement) struct StructReductionTest : public cudf::test::BaseFixture { using SCW = cudf::test::structs_column_wrapper; - void reduction_test(SCW const& struct_column, + void reduction_test(cudf::column_view const& struct_column, cudf::table_view const& expected_value, bool succeeded_condition, bool is_valid, @@ -2066,7 +2067,7 @@ struct StructReductionTest : public cudf::test::BaseFixture { cudf::reduce(struct_column, agg, cudf::data_type(cudf::type_id::STRUCT)); auto struct_result = dynamic_cast(result.get()); EXPECT_EQ(is_valid, struct_result->is_valid()); - if (is_valid) { CUDF_TEST_EXPECT_TABLES_EQUAL(expected_value, struct_result->view()); } + if (is_valid) { CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_value, struct_result->view()); } }; if (succeeded_condition) { @@ -2210,4 +2211,130 @@ TEST_F(StructReductionTest, NonValidStructReductionNthElement) cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); } +TEST_F(StructReductionTest, StructReductionMinMaxNoNull) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + + auto const input = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = INTS_CW{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return STRUCTS_CW{{child1, child2}}; + }(); + + { + auto const expected_child1 = STRINGS_CW{"$1"}; + auto const expected_child2 = INTS_CW{8}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_min_aggregation()); + } + + { + auto const expected_child1 = STRINGS_CW{"₹1"}; + auto const expected_child2 = INTS_CW{3}; + this->reduction_test(input, 
+ cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_max_aggregation()); + } +} + +TEST_F(StructReductionTest, StructReductionMinMaxSlicedInput) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + constexpr int32_t dont_care{1}; + + auto const input_original = [] { + auto child1 = STRINGS_CW{"$dont_care", + "$dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "₹dont_care"}; + auto child2 = INTS_CW{dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return STRUCTS_CW{{child1, child2}}; + }(); + + auto const input = cudf::slice(input_original, {2, 12})[0]; + + { + auto const expected_child1 = STRINGS_CW{"$1"}; + auto const expected_child2 = INTS_CW{8}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_min_aggregation()); + } + + { + auto const expected_child1 = STRINGS_CW{"₹1"}; + auto const expected_child2 = INTS_CW{3}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_max_aggregation()); + } +} + +TEST_F(StructReductionTest, StructReductionMinMaxWithNulls) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + using cudf::test::iterators::nulls_at; + + auto const input = [] { + auto child1 = STRINGS_CW{{"año", + "bit", + "₹1" /*NULL*/, + "aaa" /*NULL*/, + "zit", + "bat", + "aab", + "$1" /*NULL*/, + "€1" /*NULL*/, + "wut"}, + nulls_at({2, 7})}; + auto child2 = INTS_CW{{1, 2, 3 /*NULL*/, 4 /*NULL*/, 5, 6, 7, 8 /*NULL*/, 9 /*NULL*/, 10}, + nulls_at({2, 7})}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + { + auto const expected_child1 = STRINGS_CW{"aab"}; + auto const expected_child2 = INTS_CW{7}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_min_aggregation()); + } + + { + auto const expected_child1 = STRINGS_CW{"zit"}; + auto const expected_child2 = INTS_CW{5}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_max_aggregation()); + } +} + CUDF_TEST_PROGRAM_MAIN() From fc82b1d206e93a46c9ef3535711c88ec20bd4fde Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 19 Nov 2021 02:06:54 +0530 Subject: [PATCH 014/202] Spell check fixes (#9682) Regular spell check fixes in comments and docs. 
Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Charles Blackmon-Luca (https://github.com/charlesbluca) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9682 --- cpp/src/binaryop/compiled/binary_ops.cuh | 4 ++-- cpp/src/groupby/sort/aggregate.cpp | 4 ++-- cpp/src/io/orc/aggregate_orc_metadata.cpp | 2 +- cpp/src/io/orc/aggregate_orc_metadata.hpp | 2 +- cpp/src/io/orc/stripe_enc.cu | 4 ++-- cpp/src/io/orc/writer_impl.cu | 2 +- cpp/src/io/parquet/parquet_gpu.hpp | 4 ++-- cpp/src/io/text/multibyte_split.cu | 2 +- cpp/src/lists/drop_list_duplicates.cu | 4 ++-- cpp/src/rolling/rolling_detail.cuh | 2 +- cpp/tests/column/column_view_shallow_test.cpp | 2 +- cpp/tests/datetime/datetime_ops_test.cpp | 4 ++-- cpp/tests/transform/row_bit_count_test.cu | 6 +++--- python/cudf/cudf/core/column/column.py | 4 ++-- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/decimal.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/series.py | 8 ++++---- python/cudf/cudf/core/udf/pipeline.py | 2 +- python/cudf/cudf/core/udf/typing.py | 4 ++-- python/cudf/cudf/testing/testing.py | 2 +- python/cudf/cudf/tests/test_binops.py | 2 +- python/cudf/cudf/tests/test_custom_accessor.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 2 +- python/cudf/cudf/tests/test_multiindex.py | 10 +++++----- python/cudf/cudf/tests/test_orc.py | 4 ++-- python/cudf/cudf/utils/gpu_utils.py | 2 +- python/cudf/cudf/utils/ioutils.py | 4 ++-- python/cudf/cudf/utils/utils.py | 4 ++-- python/dask_cudf/dask_cudf/_version.py | 2 +- python/dask_cudf/dask_cudf/backends.py | 2 +- python/dask_cudf/dask_cudf/io/parquet.py | 4 ++-- python/dask_cudf/dask_cudf/io/tests/test_parquet.py | 2 +- 36 files changed, 57 insertions(+), 57 deletions(-) diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 84147fc9220..10e9b2532af 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -117,7 +117,7 @@ struct ops_wrapper { } else { return BinaryOperator{}.template operator()(x, y); } - // To supress nvcc warning + // To suppress nvcc warning return std::invoke_result_t{}; }(); if constexpr (is_bool_result()) @@ -164,7 +164,7 @@ struct ops2_wrapper { } else { return BinaryOperator{}.template operator()(x, y); } - // To supress nvcc warning + // To suppress nvcc warning return std::invoke_result_t{}; }(); if constexpr (is_bool_result()) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 234bb447761..d68b701d75f 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -559,7 +559,7 @@ auto column_view_with_common_nulls(column_view const& column_0, column_view cons } /** - * @brief Perform covariance betweeen two child columns of non-nullable struct column. + * @brief Perform covariance between two child columns of non-nullable struct column. * */ template <> @@ -602,7 +602,7 @@ void aggregate_result_functor::operator()(aggregation c }; /** - * @brief Perform correlation betweeen two child columns of non-nullable struct column. + * @brief Perform correlation between two child columns of non-nullable struct column. 
+ * @brief Perform correlation between two child columns of non-nullable struct column.
* */ template <> diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 45d60605936..82161233a92 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -79,7 +79,7 @@ void add_nested_columns(std::map>& selected_co * @brief Adds the column with the given id to the mapping * * All nested columns and direct ancestors of column `id` are included. - * Columns that are not on the direct path are excluded, which may result in prunning. + * Columns that are not on the direct path are excluded, which may result in pruning. */ void add_column_to_mapping(std::map>& selected_columns, metadata const& metadata, diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 5132906a5fc..01418fd3bd6 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -119,7 +119,7 @@ class aggregate_orc_metadata { * @brief Filters ORC file to a selection of columns, based on their paths in the file. * * Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is - * ommited to match the cuDF table hierarchy. + * omitted to match the cuDF table hierarchy. * * @param column_paths List of full column names (i.e. paths) to select from the ORC file * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 217aee8756e..829e4877c44 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -660,7 +660,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s, auto const mask_byte = get_mask_byte(column.null_mask(), column.offset()); auto dst_offset = offset + s->nnz; auto vbuf_bit_idx = [](int row) { - // valid_buf is a circular buffer with validitiy of 8 rows in each element + // valid_buf is a circular buffer with validity of 8 rows in each element return row % (encode_block_size * 8); }; if (dst_offset % 8 == 0 and pd_set_cnt == 8) { @@ -696,7 +696,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s, ByteRLE(s, s->valid_buf, s->present_out / 8, nbytes_out, flush, t) * 8; if (!t) { - // Number of rows enocoded so far + // Number of rows encoded so far s->present_out += nrows_encoded; s->numvals -= min(s->numvals, nrows_encoded); } diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 1563e3e1fd7..25c4bd65c8f 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1426,7 +1426,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, } } if (col.orc_kind() == LIST or col.orc_kind() == MAP) { - // Need a new pushdown mask unless both the parent and current colmn are not nullable + // Need a new pushdown mask unless both the parent and current column are not nullable auto const child_col = orc_table.column(col.child_begin()[0]); // pushdown mask applies to child column(s); use the child column size pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index ac2e6ba5cfb..1bd4cb3c6f4 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -307,7 +307,7 @@ struct EncColumnChunk { statistics_chunk const* stats; //!< Fragment statistics uint32_t bfr_size; //!< Uncompressed buffer size uint32_t compressed_size; //!< Compressed buffer size - uint32_t 
max_page_data_size; //!< Max data size (excuding header) of any page in this chunk + uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk uint32_t page_headers_size; //!< Sum of size of all page headers uint32_t start_row; //!< First row of chunk uint32_t num_rows; //!< Number of rows in chunk @@ -489,7 +489,7 @@ void InitFragmentStatistics(cudf::detail::device_2dspan groups /** * @brief Initialize per-chunk hash maps used for dictionary with sentinel values * - * @param chunks Flat span of chunks to intialize hash maps for + * @param chunks Flat span of chunks to initialize hash maps for * @param stream CUDA stream to use */ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index a427809c81a..d287b9f2419 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -260,7 +260,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block - // would have to follow seperate logic. + // would have to follow separate logic. multibyte_split_seed_kernel<<<1, 1, 0, stream.value()>>>( // tile_multistates, tile_offsets, diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 0663bc18ab3..527e834c76c 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -67,7 +67,7 @@ struct has_negative_nans_fn { * @brief A structure to be used along with type_dispatcher to check if a column has any * negative NaN value. * - * This functor is neccessary because when calling to segmented sort on the list entries, the + * This functor is necessary because when calling to segmented sort on the list entries, the * negative NaN and positive NaN values (if both exist) are separated to the two ends of the output * lists. We want to move all NaN values close together in order to call unique_copy later on. */ @@ -563,7 +563,7 @@ std::pair, std::unique_ptr> drop_list_duplicates values ? cudf::empty_like(values.value().parent()) : nullptr}; } - // The child column conotaining list entries. + // The child column containing list entries. auto const keys_child = keys.get_sliced_child(stream); // Generate a mapping from list entries to their 1-based list indices for the keys column. diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 12227404d83..bc1947dfeed 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -722,7 +722,7 @@ class rolling_aggregation_preprocessor final : public cudf::detail::simple_aggre } // STD aggregations depends on VARIANCE aggregation. Each element is applied - // with sqaured-root in the finalize() step. + // with square-root in the finalize() step. std::vector> visit(data_type, cudf::detail::std_aggregation const& agg) override { diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index ab324ea8505..4afa96f08d7 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -84,7 +84,7 @@ TYPED_TEST_SUITE(ColumnViewShallowTests, AllTypes); // Test for fixed_width, dict, string, list, struct // column_view, column_view = same hash. 
// column_view, make a copy = same hash. -// new column_view from colmn = same hash +// new column_view from column = same hash // column_view, copy column = diff hash // column_view, diff column = diff hash. // diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index b70ac29fd5d..2097e09e674 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -758,7 +758,7 @@ TEST_F(BasicDatetimeOpsTest, TestIsLeapYear) 707904541L, // 1992-06-07 08:09:01 GMT - leap year -2181005247L, // 1900-11-20 09:12:33 GMT - non leap year 0L, // UNIX EPOCH 1970-01-01 00:00:00 GMT - non leap year - -12212553600L, // First full year of Gregorian Calandar 1583-01-01 00:00:00 - non-leap-year + -12212553600L, // First full year of Gregorian Calendar 1583-01-01 00:00:00 - non-leap-year 0L, // null 13591632822L, // 2400-09-13 13:33:42 GMT - leap year 4539564243L, // 2113-11-08 06:04:03 GMT - non leap year @@ -827,7 +827,7 @@ TEST_F(BasicDatetimeOpsTest, TestQuarter) 707904541L, // 1992-06-07 08:09:01 GMT -2181005247L, // 1900-11-20 09:12:33 GMT 0L, // UNIX EPOCH 1970-01-01 00:00:00 GMT - -12212553600L, // First full year of Gregorian Calandar 1583-01-01 00:00:00 + -12212553600L, // First full year of Gregorian Calendar 1583-01-01 00:00:00 0L, // null 13591632822L, // 2400-09-13 13:33:42 GMT 4539564243L, // 2113-11-08 06:04:03 GMT diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 4645ff9be5f..7fb7326f221 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -228,7 +228,7 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) // Tests that `row_bit_count()` can handle struct> with more // than max_block_size (256) rows. // With a large number of rows, computation spills to multiple thread-blocks, - // thus exercising the branch-stack comptutation. + // thus exercising the branch-stack computation. // The contents of the input column aren't as pertinent to this test as the // column size. For what it's worth, it looks as follows: // [ struct({0,1}), struct({2,3}), struct({4,5}), ... ] @@ -362,7 +362,7 @@ std::pair, std::unique_ptr> build_nested_and_exp // Inner list column // clang-format off cudf::test::lists_column_wrapper list{ - {1, 2, 3, 4, 5}, + {1, 2, 3, 4, 5}, {6, 7, 8}, {33, 34, 35, 36, 37, 38, 39}, {-1, -2}, @@ -408,7 +408,7 @@ std::unique_ptr build_nested_column(std::vector const& struct_vali // Inner list column // clang-format off - cudf::test::lists_column_wrapper list{ + cudf::test::lists_column_wrapper list{ {{1, 2, 3, 4, 5}, {2, 3}}, {{6, 7, 8}, {8, 9}}, {{1, 2}, {3, 4, 5}, {33, 34, 35, 36, 37, 38, 39}}}; diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6f2f01c746d..e2bedd9d0b1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -337,7 +337,7 @@ def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": else: return self.dropna(drop_nan=False).data_array_view - # TODO: This method is decpreated and can be removed when the associated + # TODO: This method is deprecated and can be removed when the associated # Frame methods are removed. def to_array(self, fillna=None) -> np.ndarray: """Get a dense numpy array for the data. 
@@ -1851,7 +1851,7 @@ def as_column( arbitrary = np.asarray(arbitrary) - # Handle case that `arbitary` elements are cupy arrays + # Handle case that `arbitrary` elements are cupy arrays if ( shape and shape[0] diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 756e48edccb..7c8837ef45f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -543,7 +543,7 @@ def infer_format(element: str, **kwargs) -> str: if len(second_parts) > 1: # "Z" indicates Zulu time(widely used in aviation) - Which is # UTC timezone that currently cudf only supports. Having any other - # unsuppported timezone will let the code fail below + # unsupported timezone will let the code fail below # with a ValueError. second_parts.remove("Z") second_part = "".join(second_parts[1:]) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 6409a9f9196..7037b8e6f36 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -160,7 +160,7 @@ def binary_operator(self, op, other, reflect=False): if reflect: self, other = other, self - # Binary Arithmatics between decimal columns. `Scale` and `precision` + # Binary Arithmetics between decimal columns. `Scale` and `precision` # are computed outside of libcudf if op in ("add", "sub", "mul", "div"): scale = _binop_scale(self.dtype, other.dtype, op) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bfbe8b06c17..c0cb6f1917f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6367,7 +6367,7 @@ def wrapper(self, other, axis="columns", level=None, fill_value=None): # __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature # string of a function by recursively delving into __wrapped__ until # it hits the first function that has __signature__ attribute set. To make - # the signature stirng of `wrapper` matches with its actual parameter list, + # the signature string of `wrapper` matches with its actual parameter list, # we directly set the __signature__ attribute of `wrapper` below. new_sig = inspect.signature( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index dc6461663ce..7f9f61ed3fd 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -18,7 +18,7 @@ # The three functions below return the quantiles [25%, 50%, 75%] -# respectively, which are called in the describe() method to ouput +# respectively, which are called in the describe() method to output # the summary stats of a GroupBy object def _quantile_25(x): return x.quantile(0.25) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 63fda21152d..5ea9ac945dc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -623,7 +623,7 @@ def _union(self, other, sort=None): else: return result - # If all the above optimizations don't cater to the inpputs, + # If all the above optimizations don't cater to the inputs, # we materialize RangeIndex's into `Int64Index` and # then perform `union`. 
return Int64Index(self._values)._union(other, sort=sort) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 8c4f87d5f67..a1eda697683 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -970,7 +970,7 @@ def _concat(cls, objs): source_data = [o.to_frame(index=False) for o in objs] - # TODO: Verify if this is really necesary or if we can rely on + # TODO: Verify if this is really necessary or if we can rely on # DataFrame._concat. if len(source_data) > 1: colnames = source_data[0].columns diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c804f2bca2c..cf035ef457d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2916,7 +2916,7 @@ def unique(self): def nunique(self, method="sort", dropna=True): """Returns the number of unique values of the Series: approximate version, - and exact version to be moved to libgdf + and exact version to be moved to libcudf Excludes NA values by default. @@ -2985,7 +2985,7 @@ def value_counts( Returns ------- - result : Series contanining counts of unique values. + result : Series containing counts of unique values. See also -------- @@ -3802,7 +3802,7 @@ def wrapper(self, other, level=None, fill_value=None, axis=0): # __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature # string of a function by recursively delving into __wrapped__ until # it hits the first function that has __signature__ attribute set. To make - # the signature stirng of `wrapper` matches with its actual parameter list, + # the signature string of `wrapper` matches with its actual parameter list, # we directly set the __signature__ attribute of `wrapper` below. new_sig = inspect.signature( @@ -5054,7 +5054,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): """Returns a boolean array where two arrays are equal within a tolerance. - Two values in ``a`` and ``b`` are considiered equal when the following + Two values in ``a`` and ``b`` are considered equal when the following equation is satisfied. .. math:: diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py index deb4546e8b8..2464906be04 100644 --- a/python/cudf/cudf/core/udf/pipeline.py +++ b/python/cudf/cudf/core/udf/pipeline.py @@ -316,7 +316,7 @@ def compile_or_get(frame, func, args): Return a compiled kernel in terms of MaskedTypes that launches a kernel equivalent of `f` for the dtypes of `df`. The kernel uses a thread for each row and calls `f` using that rows data / mask - to produce an output value and output valdity for each row. + to produce an output value and output validity for each row. If the UDF has already been compiled for this requested dtypes, a cached version will be returned instead of running compilation. diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py index 4b0f0bf1283..da7ff4c0e32 100644 --- a/python/cudf/cudf/core/udf/typing.py +++ b/python/cudf/cudf/core/udf/typing.py @@ -67,7 +67,7 @@ def unify(self, context, other): """ Often within a UDF an instance arises where a variable could be a `MaskedType`, an `NAType`, or a literal based off - the data at runtime, for examplem the variable `ret` here: + the data at runtime, for example the variable `ret` here: def f(x): if x == 1: @@ -185,7 +185,7 @@ class NAType(types.Type): """ A type for handling ops against nulls Exists so we can: - 1. 
Teach numba that all occurances of `cudf.NA` are + 1. Teach numba that all occurrences of `cudf.NA` are to be read as instances of this type instead 2. Define ops like `if x is cudf.NA` where `x` is of type `Masked` to mean `if x.valid is False` diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 9562fca7399..59c291eea0b 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -410,7 +410,7 @@ def assert_series_equal( Whether to check the Index class, dtype and inferred_type are identical. check_series_type : bool, default True - Whether to check the seires class, dtype and + Whether to check the series class, dtype and inferred_type are identical. Currently it is idle, and similar to pandas. check_less_precise : bool or int, default False diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 542dcd9301c..ba2a6dce369 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1173,7 +1173,7 @@ def make_scalar_product_data(): ) ) - # we can muliply any timedelta by any int, or bool + # we can multiply any timedelta by any int, or bool valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES | BOOL_TYPES)) # we can multiply a float by any int, float, or bool diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py index 16e5b345ce2..bfd2ccbccef 100644 --- a/python/cudf/cudf/tests/test_custom_accessor.py +++ b/python/cudf/cudf/tests/test_custom_accessor.py @@ -44,7 +44,7 @@ def test_dataframe_accessor(gdf): "gdf2", [gd.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})] ) def test_dataframe_accessor_idendity(gdf1, gdf2): - """Test for accessor idendities + """Test for accessor identities - An object should hold persistent reference to the same accessor - Different objects should hold difference instances of the accessor """ diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index bf75badc06f..a95be4f7932 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -171,7 +171,7 @@ def test_dt_ops(data): assert_eq(pd_data > pd_data, gdf_data > gdf_data) -# libgdf doesn't respect timezones +# libcudf doesn't respect timezones @pytest.mark.parametrize("data", [data1()]) @pytest.mark.parametrize("field", fields) def test_dt_series(data, field): diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index d409a099806..07407b8d359 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -738,9 +738,9 @@ def test_multiindex_copy_sem(data, levels, codes, names): ) @pytest.mark.parametrize("deep", [True, False]) def test_multiindex_copy_deep(data, deep): - """Test memory idendity for deep copy + """Test memory identity for deep copy Case1: Constructed from GroupBy, StringColumns - Case2: Constrcuted from MultiIndex, NumericColumns + Case2: Constructed from MultiIndex, NumericColumns """ same_ref = not deep @@ -768,19 +768,19 @@ def test_multiindex_copy_deep(data, deep): mi1 = data mi2 = mi1.copy(deep=deep) - # Assert ._levels idendity + # Assert ._levels identity lptrs = [lv._data._data[None].base_data.ptr for lv in mi1._levels] rptrs = [lv._data._data[None].base_data.ptr for lv in mi2._levels] assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)]) - # Assert ._codes idendity + # Assert ._codes identity 
lptrs = [c.base_data.ptr for _, c in mi1._codes._data.items()] rptrs = [c.base_data.ptr for _, c in mi2._codes._data.items()] assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)]) - # Assert ._data idendity + # Assert ._data identity lptrs = [d.base_data.ptr for _, d in mi1._data.items()] rptrs = [d.base_data.ptr for _, d in mi2._data.items()] diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 99b5652110b..6b02874146e 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -502,7 +502,7 @@ def test_orc_writer_sliced(tmpdir): "TestOrcFile.decimal.orc", "TestOrcFile.decimal.same.values.orc", "TestOrcFile.decimal.multiple.values.orc", - # For addional information take look at PR 7034 + # For additional information take look at PR 7034 "TestOrcFile.decimal.runpos.issue.orc", ], ) @@ -541,7 +541,7 @@ def test_orc_decimal_precision_fail(datadir): assert_eq(pdf, gdf) -# For addional information take look at PR 6636 and 6702 +# For additional information take look at PR 6636 and 6702 @pytest.mark.parametrize( "orc_file", [ diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index 77963f8bcc1..dbdd68f2df8 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -143,7 +143,7 @@ def _try_get_old_or_new_symbols(): cuda_driver_supported_rt_version >= 11000 and cuda_runtime_version >= 11000 ): - # With cuda enhanced compatibitlity any code compiled + # With cuda enhanced compatibility any code compiled # with 11.x version of cuda can now run on any # driver >= 450.80.02. 11000 is the minimum cuda # version 450.80.02 supports. diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 11994830fed..0f9d9d53b23 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1038,7 +1038,7 @@ should consume messages from. Valid values are 0 - (N-1) start_offset : int, Kafka Topic/Partition offset that consumption should begin at. Inclusive. -end_offset : int, Kafka Topic/Parition offset that consumption +end_offset : int, Kafka Topic/Partition offset that consumption should end at. Inclusive. batch_timeout : int, default 10000 Maximum number of milliseconds that will be spent trying to @@ -1061,7 +1061,7 @@ or any object with a `read()` method (such as builtin `open()` file handler function or `StringIO`). delimiter : string, default None, The delimiter that should be used - for splitting text chunks into seperate cudf column rows. Currently + for splitting text chunks into separate cudf column rows. Currently only a single delimiter is supported. 
Returns diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 4f9b23bf6fe..a9611a91554 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -353,7 +353,7 @@ def get_appropriate_dispatched_func( elif hasattr(cupy_submodule, fname): cupy_func = getattr(cupy_submodule, fname) - # Handle case if cupy impliments it as a numpy function + # Handle case if cupy implements it as a numpy function # Unsure if needed if cupy_func is func: return NotImplemented @@ -374,7 +374,7 @@ def _cast_to_appropriate_cudf_type(val, index=None): elif (val.ndim == 1) or (val.ndim == 2 and val.shape[1] == 1): # if index is not None and is of a different length # than the index, cupy dispatching behaviour is undefined - # so we dont impliment it + # so we don't implement it if (index is None) or (len(index) == len(val)): return cudf.Series(val, index=index) diff --git a/python/dask_cudf/dask_cudf/_version.py b/python/dask_cudf/dask_cudf/_version.py index eb7457f3465..8ca2cf98381 100644 --- a/python/dask_cudf/dask_cudf/_version.py +++ b/python/dask_cudf/dask_cudf/_version.py @@ -417,7 +417,7 @@ def render_pep440_old(pieces): The ".dev0" means dirty. - Eexceptions: + Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index f81a4743a4a..89b5301ee83 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -196,7 +196,7 @@ def make_meta_object_cudf(x, index=None): ) elif not hasattr(x, "dtype") and x is not None: # could be a string, a dtype object, or a python type. Skip `None`, - # because it is implictly converted to `dtype('f8')`, which we don't + # because it is implicitly converted to `dtype('f8')`, which we don't # want here. 
try: dtype = np.dtype(x) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 2e5d55e92d2..b47a5e78095 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -111,7 +111,7 @@ def _read_paths( frag = next(ds.get_fragments()) if frag: # Extract hive-partition keys, and make sure they - # are orderd the same as they are in `partitions` + # are ordered the same as they are in `partitions` raw_keys = pa_ds._get_partition_keys(frag.partition_expression) partition_keys = [ (hive_part.name, raw_keys[hive_part.name]) @@ -173,7 +173,7 @@ def read_partition( strings_to_cats = kwargs.get("strings_to_categorical", False) - # Assume multi-peice read + # Assume multi-piece read paths = [] rgs = [] last_partition_keys = None diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index d93037b3802..706b0e272ea 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -378,7 +378,7 @@ def test_chunksize(tmpdir, chunksize, metadata): # one output partition assert ddf3.npartitions == 1 else: - # Files can be aggregateed together, but + # Files can be aggregated together, but # chunksize is not large enough to produce # a single output partition assert ddf3.npartitions < num_row_groups From c1bfb26715e0234f6d90aceac7a52caded2e9f9e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 18 Nov 2021 19:29:14 -0500 Subject: [PATCH 015/202] Fix regex non-multiline EOL/$ matching strings ending with a new-line (#9715) Closes #9620 Fixes an edge case described in https://docs.python.org/3/library/re.html#re.MULTILINE where the '$' EOL regex pattern character (without `MULTILINE` set) should match at the very end of a string, and also just before a new-line at the end of that string.
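For reference, the target semantics can be checked with CPython's own `re` module; this snippet is illustrative only and is not part of the patch:

```
import re

# Without re.MULTILINE, "$" matches at the very end of the string,
# and also just before a new-line that terminates the string.
assert re.search("^abc$", "abc") is not None
assert re.search("^abc$", "abc\n") is not None  # the edge case fixed here
assert re.search("^abc$", "abc\nx") is None     # "\n" is not at the end
```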
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Christopher Harris (https://github.com/cwharris) - Vukasin Milovanovic (https://github.com/vuule) - Sheilah Kirui (https://github.com/skirui-source) URL: https://github.com/rapidsai/cudf/pull/9715 --- cpp/src/strings/regex/regex.inl | 5 ++++- cpp/tests/strings/contains_tests.cpp | 17 +++++++++-------- python/cudf/cudf/tests/test_string.py | 5 +++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 66e99756615..bc0679993d0 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -276,7 +276,10 @@ __device__ inline int32_t reprog_device::regexec( } break; case EOL: - if (last_character || (inst->u1.c == '$' && c == '\n')) { + if (last_character || + (c == '\n' && (inst->u1.c == '$' || + // edge case where \n appears at the end of the string + pos + 1 == dstr.length()))) { id_activate = inst->u2.next_id; expanded = true; } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 3c11444e4b5..229f9e4cc82 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -302,28 +302,29 @@ TEST_F(StringsContainsTests, CountTest) TEST_F(StringsContainsTests, MultiLine) { - auto input = cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", ""}); - auto view = cudf::strings_column_view(input); + auto input = + cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); + auto view = cudf::strings_column_view(input); auto results = cudf::strings::contains_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0}); + auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); results = cudf::strings::contains_re(view, "^abc$"); - expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); results = cudf::strings::matches_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0}); + auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); results = cudf::strings::matches_re(view, "^abc$"); - expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); results = cudf::strings::count_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); results = cudf::strings::count_re(view, "^abc$"); - expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index c75eb91a335..cf52c4684c8 100644 --- 
a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1746,12 +1746,13 @@ def test_string_wrap(data, width): ["A B", "1.5", "3,000"], ["23", "³", "⅕", ""], [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], + ["$", "B", "Aab$", "$$ca", "C$B$", "cat", "cat\n"], ["line\nto be wrapped", "another\nline\nto be wrapped"], ], ) @pytest.mark.parametrize( - "pat", ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be"] + "pat", + ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be", "cat$"], ) @pytest.mark.parametrize("flags", [0, re.MULTILINE, re.DOTALL]) def test_string_count(data, pat, flags): From 05dd5415b1391270ea74d1f33080bbbf58f848cc Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Nov 2021 14:32:37 -0800 Subject: [PATCH 016/202] Use List of Columns as Input for `drop_nulls`, `gather` and `drop_duplicates` (#9558) Currently, there are several APIs that accept a `Frame` object as input, corresponding to their libcudf counterparts that accept a `table_view`. To make some of them also work for columns, we currently pass the columns through `as_frame` and convert the result back with `_as_column`. This PR changes the Cython API to accept a list of columns instead, greatly reducing the overhead of the column roundtrip (see the column API benchmarks below). As a pilot study toward standardizing the Cython calling convention for table APIs, this PR makes the following decisions: 1. Use `list` as the container for the collection of columns. An iterable would be more pythonic, but would lose some type safety. 2. The column collection is agnostic to index/data columns; libcudf does not handle index columns separately either. This helps simplify the Cython logic.
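As a rough sketch of the new convention (distilled from the diffs below; `df` and `gather_map` here are placeholder inputs, not names introduced by this PR):

```
# Cython APIs now take a flat list of Columns and return a list of Columns.
# The caller decides whether the index participates by prepending its
# columns to the list, and rebuilds the result with `_from_columns`.
columns = libcudf.copying.gather(
    list(df._index._columns + df._columns),  # index columns first, if kept
    gather_map,
    nullify=False,
)
result = df.__class__._from_columns(
    columns, df._column_names, df._index.names
)
```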
Gather/Take Benchmark ``` ----------------------------------- benchmark '100-random-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------------------- gather_single_column[100-random-False] (afte) 420.4372 (1.0) 552.7758 (1.0) 428.8227 (1.0) gather_single_column[100-random-False] (befo) 597.7047 (1.42) 811.8181 (1.47) 606.3709 (1.41) take_multiple_column[100-random-False] (afte) 849.6591 (2.02) 6,339.7521 (11.47) 870.1292 (2.03) take_multiple_column[100-random-False] (befo) 864.0001 (2.06) 1,091.5170 (1.97) 872.8270 (2.04) -------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '100-random-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------------------- gather_single_column[100-random-True] (afte) 141.4879 (1.0) 3,144.3723 (2.64) 145.7316 (1.0) gather_single_column[100-random-True] (befo) 291.5259 (2.06) 3,083.7669 (2.59) 299.2343 (2.05) take_multiple_column[100-random-True] (afte) 958.2350 (6.77) 1,295.6643 (1.09) 971.2230 (6.66) take_multiple_column[100-random-True] (befo) 967.4439 (6.84) 1,191.7809 (1.0) 976.4725 (6.70) ------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '100-reverse-False': 4 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------------- gather_single_column[100-reverse-False] (afte) 414.2257 (1.0) 6,856.2678 (2.05) 426.5804 (1.0) gather_single_column[100-reverse-False] (befo) 589.7889 (1.42) 3,387.3413 (1.01) 602.0794 (1.41) take_multiple_column[100-reverse-False] (afte) 849.6824 (2.05) 4,650.7069 (1.39) 862.7702 (2.02) take_multiple_column[100-reverse-False] (befo) 863.7700 (2.09) 3,348.6579 (1.0) 877.5145 (2.06) --------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '100-reverse-True': 4 tests ------------------------------------ Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------------------- gather_single_column[100-reverse-True] (afte) 141.5601 (1.0) 292.0129 (1.0) 144.5997 (1.0) gather_single_column[100-reverse-True] (befo) 286.7738 (2.03) 4,374.5530 (14.98) 297.3910 (2.06) take_multiple_column[100-reverse-True] (afte) 960.0958 (6.78) 1,354.3908 (4.64) 973.7589 (6.73) take_multiple_column[100-reverse-True] (befo) 963.5990 (6.81) 1,175.8050 (4.03) 975.9332 (6.75) -------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '100-sequence-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------------------- gather_single_column[100-sequence-False] (afte) 418.4479 (1.0) 4,602.9259 (2.09) 436.3953 (1.0) gather_single_column[100-sequence-False] (befo) 589.5318 (1.41) 4,665.3422 (2.12) 605.6177 (1.39) take_multiple_column[100-sequence-False] (afte) 851.3979 (2.03) 5,037.6062 (2.29) 866.8329 (1.99) 
take_multiple_column[100-sequence-False] (befo) 858.9821 (2.05) 2,197.5730 (1.0) 872.5517 (2.00) ---------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '100-sequence-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------------- gather_single_column[100-sequence-True] (afte) 145.0991 (1.0) 229.3726 (1.0) 148.7882 (1.0) gather_single_column[100-sequence-True] (befo) 289.9761 (2.00) 363.9143 (1.59) 295.9855 (1.99) take_multiple_column[100-sequence-True] (afte) 961.4970 (6.63) 1,028.0283 (4.48) 969.3146 (6.51) take_multiple_column[100-sequence-True] (befo) 962.7347 (6.64) 1,048.2450 (4.57) 973.8807 (6.55) --------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-random-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------------------- gather_single_column[10000-random-False] (afte) 419.3909 (1.0) 669.2931 (1.0) 427.0140 (1.0) gather_single_column[10000-random-False] (befo) 600.0311 (1.43) 2,198.0200 (3.28) 610.3418 (1.43) take_multiple_column[10000-random-False] (afte) 862.4257 (2.06) 4,764.4433 (7.12) 880.1974 (2.06) take_multiple_column[10000-random-False] (befo) 873.0851 (2.08) 1,024.1494 (1.53) 881.4482 (2.06) ---------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '10000-random-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------------- gather_single_column[10000-random-True] (afte) 134.2846 (1.0) 4,995.3298 (12.11) 139.0623 (1.0) gather_single_column[10000-random-True] (befo) 284.2899 (2.12) 412.4213 (1.0) 289.8005 (2.08) take_multiple_column[10000-random-True] (afte) 960.2159 (7.15) 1,361.8441 (3.30) 973.4057 (7.00) take_multiple_column[10000-random-True] (befo) 965.8998 (7.19) 1,140.6899 (2.77) 976.9224 (7.03) --------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '10000-reverse-False': 4 tests ----------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------------------- gather_single_column[10000-reverse-False] (afte) 419.7811 (1.0) 634.7937 (1.0) 428.2997 (1.0) gather_single_column[10000-reverse-False] (befo) 600.3999 (1.43) 762.5762 (1.20) 608.6369 (1.42) take_multiple_column[10000-reverse-False] (afte) 856.1970 (2.04) 1,138.3081 (1.79) 870.1638 (2.03) take_multiple_column[10000-reverse-False] (befo) 869.8748 (2.07) 3,184.0033 (5.02) 889.7182 (2.08) ----------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-reverse-True': 4 tests ------------------------------------ Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------------------- gather_single_column[10000-reverse-True] (afte) 135.4842 (1.0) 
3,634.2950 (7.81) 140.8658 (1.0) gather_single_column[10000-reverse-True] (befo) 284.9372 (2.10) 465.4219 (1.0) 292.6105 (2.08) take_multiple_column[10000-reverse-True] (afte) 957.0192 (7.06) 1,240.3540 (2.67) 966.7779 (6.86) take_multiple_column[10000-reverse-True] (befo) 967.6940 (7.14) 1,062.0849 (2.28) 975.9307 (6.93) ---------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-sequence-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------------------------ gather_single_column[10000-sequence-False] (afte) 420.3622 (1.0) 555.1544 (1.0) 427.4441 (1.0) gather_single_column[10000-sequence-False] (befo) 601.7918 (1.43) 3,534.9689 (6.37) 613.6190 (1.44) take_multiple_column[10000-sequence-False] (afte) 858.0340 (2.04) 1,166.5919 (2.10) 868.6121 (2.03) take_multiple_column[10000-sequence-False] (befo) 871.3542 (2.07) 1,118.0961 (2.01) 881.9761 (2.06) ------------------------------------------------------------------------------------------------------------------ ------------------------------------ benchmark '10000-sequence-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------------------- gather_single_column[10000-sequence-True] (afte) 135.8581 (1.0) 3,894.4702 (3.55) 141.3496 (1.0) gather_single_column[10000-sequence-True] (befo) 284.5018 (2.09) 2,703.6560 (2.47) 290.8583 (2.06) take_multiple_column[10000-sequence-True] (afte) 957.4448 (7.05) 1,096.1141 (1.0) 966.4487 (6.84) take_multiple_column[10000-sequence-True] (befo) 966.2341 (7.11) 1,242.0323 (1.13) 978.3753 (6.92) ----------------------------------------------------------------------------------------------------------------- ```
Dropna Benchmark ``` ------------------------------------ benchmark '100-False': 2 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------------- dropna_single_column[100-False] (afte) 143.9294 (1.0) 6,808.9343 (1.58) 150.8468 (1.0) dropna_single_column[100-False] (befo) 306.3441 (2.13) 4,297.9000 (1.0) 315.3899 (2.09) ------------------------------------------------------------------------------------------------------- ---------------------------------- benchmark '100-True': 2 tests ----------------------------------- Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------- dropna_single_column[100-True] (afte) 275.7823 (1.0) 327.2779 (1.0) 279.8443 (1.0) dropna_single_column[100-True] (befo) 548.6836 (1.99) 692.2791 (2.12) 557.9867 (1.99) ---------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '10000-False': 2 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------- dropna_single_column[10000-False] (afte) 164.9209 (1.0) 5,742.9820 (1.61) 170.0143 (1.0) dropna_single_column[10000-False] (befo) 328.6479 (1.99) 3,565.7589 (1.0) 336.6208 (1.98) --------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-True': 2 tests ------------------------------------ Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------------- dropna_single_column[10000-True] (afte) 304.6701 (1.0) 441.9931 (1.0) 309.9858 (1.0) dropna_single_column[10000-True] (befo) 571.9690 (1.88) 5,526.0560 (12.50) 586.4943 (1.89) -------------------------------------------------------------------------------------------------------- ```
Unique/Drop_duplicate Benchmark ``` ------------------------------------ benchmark '100': 4 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------- drop_duplicate_df[100] (afte) 891.9560 (2.77) 1,151.0071 (2.76) 904.5752 (2.74) drop_duplicate_df[100] (befo) 880.9832 (2.74) 5,528.1101 (13.23) 896.1535 (2.72) unique_single_column[100] (afte) 322.0579 (1.0) 417.7210 (1.0) 329.5932 (1.0) unique_single_column[100] (befo) 480.7310 (1.49) 4,470.7772 (10.70) 491.7183 (1.49) ------------------------------------------------------------------------------------------------- -------------------------------- benchmark '10000': 4 tests ------------------------------- Name (time in ms) Min Max Mean ------------------------------------------------------------------------------------------- drop_duplicate_df[10000] (afte) 1.0108 (2.23) 3.9981 (4.72) 1.0280 (2.17) drop_duplicate_df[10000] (befo) 1.0021 (2.21) 3.5031 (4.14) 1.0177 (2.15) unique_single_column[10000] (afte) 0.4534 (1.0) 4.5188 (5.33) 0.4740 (1.0) unique_single_column[10000] (befo) 0.6095 (1.34) 0.8471 (1.0) 0.6332 (1.34) ------------------------------------------------------------------------------------------- ```
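At the column level, the new functions can now be called directly, skipping the `Frame` roundtrip entirely. A minimal sketch of this internal usage (assuming the private `Series._column` accessor; see `Column.dropna` and `Column.unique` in the diff below):

```
import cudf
from cudf._lib.stream_compaction import drop_duplicates, drop_nulls

col = cudf.Series([1, None, 2, 2])._column
without_nulls = drop_nulls([col])[0]                 # null rows removed
distinct = drop_duplicates([col], keep="first")[0]   # as in Column.unique
```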
Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/9558 --- python/cudf/cudf/_lib/copying.pyx | 39 ++--- python/cudf/cudf/_lib/stream_compaction.pyx | 81 +++------- python/cudf/cudf/_lib/utils.pxd | 1 + python/cudf/cudf/_lib/utils.pyx | 34 +++- python/cudf/cudf/core/column/column.py | 56 ++++--- python/cudf/cudf/core/frame.py | 162 ++++++++++++-------- python/cudf/cudf/core/index.py | 29 +--- python/cudf/cudf/core/indexed_frame.py | 113 +++++++++++++- python/cudf/cudf/core/multiindex.py | 8 +- python/cudf/cudf/utils/utils.py | 18 +++ 10 files changed, 322 insertions(+), 219 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 26ef428f21f..28bd78733a3 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -37,7 +37,12 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type -from cudf._lib.utils cimport data_from_table_view, data_from_unique_ptr +from cudf._lib.utils cimport ( + columns_from_unique_ptr, + data_from_table_view, + data_from_unique_ptr, + table_view_from_columns, +) # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar @@ -144,27 +149,12 @@ def copy_range(Column input_column, def gather( - source_table, + columns: list, Column gather_map, - bool keep_index=True, - bool nullify=False, - bool check_bounds=True + bool nullify=False ): - if not pd.api.types.is_integer_dtype(gather_map.dtype): - raise ValueError("Gather map is not integer dtype.") - - if check_bounds and len(gather_map) > 0 and not nullify: - gm_min, gm_max = minmax(gather_map) - if gm_min < -len(source_table) or gm_max >= len(source_table): - raise IndexError(f"Gather map index with min {gm_min}," - f" max {gm_max} is out of bounds in" - f" {type(source_table)} with {len(source_table)}" - f" rows.") - cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table( - source_table, not keep_index - ) + cdef table_view source_table_view = table_view_from_columns(columns) cdef column_view gather_map_view = gather_map.view() cdef cpp_copying.out_of_bounds_policy policy = ( cpp_copying.out_of_bounds_policy.NULLIFY if nullify @@ -180,16 +170,7 @@ def gather( ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if ( - source_table._index is None) - or keep_index is False - else source_table._index_names - ) - ) + return columns_from_unique_ptr(move(c_result)) def scatter(object source, Column scatter_map, Column target_column, diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 7167d18409e..ef47e843723 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -24,40 +24,34 @@ from cudf._lib.cpp.types cimport ( null_policy, size_type, ) -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport ( + columns_from_unique_ptr, + data_from_unique_ptr, + table_view_from_columns, + table_view_from_table, +) -def drop_nulls(source_table, how="any", keys=None, thresh=None): +def drop_nulls(columns: list, how="any", keys=None, thresh=None): """ Drops 
null rows from cols depending on key columns. Parameters ---------- - source_table : source table whose null rows are dropped to form new table + columns : list of columns how : "any" or "all". If thresh is None, drops rows of cols that have any nulls or all nulls (respectively) in subset (default: "any") - keys : List of Column names. If set, then these columns are checked for - nulls rather than all of cols (optional) + keys : List of column indices. If set, then these columns are checked for + nulls rather than all of columns (optional) thresh : Minimum number of non-nulls required to keep a row (optional) Returns ------- - Frame with null rows dropped + columns with null rows dropped """ - num_index_columns = ( - 0 if source_table._index is None else - source_table._index._num_columns) - # shifting the index number by number of index columns cdef vector[size_type] cpp_keys = ( - [ - num_index_columns + source_table._column_names.index(name) - for name in keys - ] - if keys is not None - else range( - num_index_columns, num_index_columns + source_table._num_columns - ) + keys if keys is not None else range(len(columns)) ) cdef size_type c_keep_threshold = cpp_keys.size() @@ -67,7 +61,7 @@ def drop_nulls(source_table, how="any", keys=None, thresh=None): c_keep_threshold = 1 cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table(source_table) + cdef table_view source_table_view = table_view_from_columns(columns) with nogil: c_result = move( @@ -78,13 +72,7 @@ def drop_nulls(source_table, how="any", keys=None, thresh=None): ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if source_table._index is None - else source_table._index_names) - ) + return columns_from_unique_ptr(move(c_result)) def apply_boolean_mask(source_table, Column boolean_mask): @@ -124,26 +112,29 @@ def apply_boolean_mask(source_table, Column boolean_mask): ) -def drop_duplicates(source_table, +def drop_duplicates(columns: list, object keys=None, object keep='first', - bool nulls_are_equal=True, - bool ignore_index=False): + bool nulls_are_equal=True): """ Drops rows in source_table as per duplicate rows in keys. Parameters ---------- - source_table : source_table whose rows gets dropped - keys : List of Column names belong to source_table + columns : List of columns + keys : List of column indices. If set, then these columns are checked for + duplicates rather than all of columns (optional) keep : keep 'first' or 'last' or none of the duplicate rows nulls_are_equal : if True, nulls are treated equal else not. 
Returns ------- - Frame with duplicate dropped + columns with duplicate dropped """ + cdef vector[size_type] cpp_keys = ( + keys if keys is not None else range(len(columns)) + ) cdef duplicate_keep_option cpp_keep_option if keep == 'first': @@ -155,30 +146,14 @@ def drop_duplicates(source_table, else: raise ValueError('keep must be either "first", "last" or False') - num_index_columns =( - 0 if (source_table._index is None or ignore_index) - else source_table._index._num_columns) # shifting the index number by number of index columns - cdef vector[size_type] cpp_keys = ( - [ - num_index_columns + source_table._column_names.index(name) - for name in keys - ] - if keys is not None - else range( - num_index_columns, num_index_columns + source_table._num_columns - ) - ) - cdef null_equality cpp_nulls_equal = ( null_equality.EQUAL if nulls_are_equal else null_equality.UNEQUAL ) cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index - ) + cdef table_view source_table_view = table_view_from_columns(columns) with nogil: c_result = move( @@ -190,13 +165,7 @@ def drop_duplicates(source_table, ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if (source_table._index is None or ignore_index) - else source_table._index_names) - ) + return columns_from_unique_ptr(move(c_result)) def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 10f76279401..50893ef9838 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -16,3 +16,4 @@ cdef data_from_table_view( table_view tv, object owner, object column_names, object index_names=*) cdef table_view table_view_from_columns(columns) except * cdef table_view table_view_from_table(tbl, ignore_index=*) except* +cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 18eed2b3396..40edd4bf9a2 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -36,7 +36,6 @@ PARQUET_META_TYPE_MAP = { for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items() } - cdef table_view table_view_from_columns(columns) except*: """Create a cudf::table_view from an iterable of Columns.""" cdef vector[column_view] column_views @@ -221,6 +220,32 @@ def _index_level_name(index_name, level, column_names): return f"__index_level_{level}__" +cdef columns_from_unique_ptr( + unique_ptr[table] c_tbl +): + """Convert a libcudf table into list of columns. + + Parameters + ---------- + c_tbl : unique_ptr[cudf::table] + The libcudf table whose columns will be extracted + + Returns + ------- + list[Column] + A list of columns. + """ + cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) + cdef vector[unique_ptr[column]].iterator it = c_columns.begin() + + cdef size_t i + + columns = [Column.from_unique_ptr(move(dereference(it+i))) + for i in range(c_columns.size())] + + return columns + + cdef data_from_unique_ptr( unique_ptr[table] c_tbl, column_names, index_names=None ): @@ -255,13 +280,8 @@ cdef data_from_unique_ptr( tuple(Dict[str, Column], Optional[Index]) A dict of the columns in the output table. 
""" - cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) - cdef vector[unique_ptr[column]].iterator it = c_columns.begin() - - cdef size_t i - columns = [Column.from_unique_ptr(move(dereference(it+i))) - for i in range(c_columns.size())] + columns = columns_from_unique_ptr(move(c_tbl)) # First construct the index, if any index = ( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e2bedd9d0b1..1d113f6e159 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -34,7 +34,11 @@ create_null_mask, ) from cudf._lib.scalar import as_device_scalar -from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count +from cudf._lib.stream_compaction import ( + distinct_count as cpp_distinct_count, + drop_duplicates, + drop_nulls, +) from cudf._lib.transform import bools_to_mask from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.api.types import ( @@ -71,7 +75,7 @@ pandas_dtypes_alias_to_cudf_alias, pandas_dtypes_to_np_dtypes, ) -from cudf.utils.utils import mask_dtype +from cudf.utils.utils import _gather_map_is_valid, mask_dtype T = TypeVar("T", bound="ColumnBase") @@ -200,11 +204,8 @@ def any(self, skipna: bool = True) -> bool: return result_col def dropna(self, drop_nan: bool = False) -> ColumnBase: - if drop_nan: - col = self.nans_to_nulls() - else: - col = self - return col.as_frame()._drop_na_rows(drop_nan=drop_nan)._as_column() + col = self.nans_to_nulls() if drop_nan else self + return drop_nulls([col])[0] def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -686,28 +687,27 @@ def median(self, skipna: bool = None) -> ScalarLike: raise TypeError(f"cannot perform median with type {self.dtype}") def take( - self: T, - indices: ColumnBase, - keep_index: bool = True, - nullify: bool = False, + self: T, indices: ColumnBase, nullify: bool = False, check_bounds=True ) -> T: - """Return Column by taking values from the corresponding *indices*.""" + """Return Column by taking values from the corresponding *indices*. + + Skip bounds checking if check_bounds is False. + Set rows to null for all out of bound indices if nullify is `True`. + """ # Handle zero size if indices.size == 0: return cast(T, column_empty_like(self, newsize=0)) - try: - return ( - self.as_frame() - ._gather(indices, keep_index=keep_index, nullify=nullify) - ._as_column() - ._with_type_metadata(self.dtype) - ) - except RuntimeError as e: - if "out of bounds" in str(e): - raise IndexError( - f"index out of bounds for column of size {len(self)}" - ) from e - raise + + # TODO: For performance, the check and conversion of gather map should + # be done by the caller. This check will be removed in future release. + if not is_integer_dtype(indices.dtype): + indices = indices.astype("int32") + if not _gather_map_is_valid(indices, len(self), check_bounds, nullify): + raise IndexError("Gather map index is out of bounds.") + + return libcudf.copying.gather([self], indices, nullify=nullify)[ + 0 + ]._with_type_metadata(self.dtype) def isin(self, values: Sequence) -> ColumnBase: """Check whether values are contained in the Column. 
@@ -1098,11 +1098,7 @@ def unique(self) -> ColumnBase: # the following issue resolved: # https://github.com/rapidsai/cudf/issues/5286 - return ( - self.as_frame() - .drop_duplicates(keep="first", ignore_index=True) - ._as_column() - ) + return drop_duplicates([self], keep="first")[0] def serialize(self) -> Tuple[dict, list]: header: Dict[Any, Any] = {} diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 58fe8a43d8d..d7a75cb9f40 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -52,6 +52,7 @@ from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type, is_column_like +from cudf.utils.utils import _gather_map_is_valid T = TypeVar("T", bound="Frame") @@ -140,6 +141,37 @@ def _from_data( Frame.__init__(obj, data, index) return obj + @classmethod + def _from_columns( + cls, + columns: List[ColumnBase], + column_names: List[str], + index_names: Optional[List[str]] = None, + ): + """Construct a `Frame` object from a list of columns. + + If `index_names` is set, the first `len(index_names)` columns are + used to construct the index of the frame. + """ + index = None + n_index_columns = 0 + if index_names is not None: + n_index_columns = len(index_names) + index = cudf.core.index._index_from_data( + dict(zip(range(n_index_columns), columns)) + ) + if isinstance(index, cudf.MultiIndex): + index.names = index_names + else: + index.name = index_names[0] + + data = { + name: columns[i + n_index_columns] + for i, name in enumerate(column_names) + } + + return cls._from_data(data, index) + def _mimic_inplace( self: T, result: Frame, inplace: bool = False ) -> Optional[Frame]: @@ -520,22 +552,32 @@ def _get_columns_by_index(self, indices): def _gather( self, gather_map, keep_index=True, nullify=False, check_bounds=True ): + """Gather rows of frame specified by indices in `gather_map`. + + Skip bounds checking if check_bounds is False. + Set rows to null for all out of bound indices if nullify is `True`. + """ + # TODO: `keep_index` argument is to be removed. + gather_map = cudf.core.column.as_column(gather_map) + + # TODO: For performance, the check and conversion of gather map should + # be done by the caller. This check will be removed in future release. 
if not is_integer_dtype(gather_map.dtype): gather_map = gather_map.astype("int32") - result = self.__class__._from_data( - *libcudf.copying.gather( - self, - as_column(gather_map), - keep_index=keep_index, - nullify=nullify, - check_bounds=check_bounds, - ) + + if not _gather_map_is_valid( + gather_map, len(self), check_bounds, nullify + ): + raise IndexError("Gather map index is out of bounds.") + + result = self.__class__._from_columns( + libcudf.copying.gather( + list(self._columns), gather_map, nullify=nullify, + ), + self._column_names, ) - result._copy_type_metadata(self, include_index=keep_index) - result._data.names = self._data.names - if keep_index and self._index is not None: - result._index.names = self._index.names + result._copy_type_metadata(self) return result def _hash(self, method, initial_hash=None): @@ -1396,10 +1438,8 @@ def _drop_na_rows( diff = set(subset) - set(self._data) if len(diff) != 0: raise KeyError(f"columns {diff} do not exist") - subset_cols = [ - name for name, col in self._data.items() if name in subset - ] - if len(subset_cols) == 0: + + if len(subset) == 0: return self.copy(deep=True) frame = self.copy(deep=False) @@ -1412,16 +1452,19 @@ def _drop_na_rows( else: frame._data[name] = col - result = self.__class__._from_data( - *libcudf.stream_compaction.drop_nulls( - frame, how=how, keys=subset, thresh=thresh - ) + result = self.__class__._from_columns( + libcudf.stream_compaction.drop_nulls( + list(self._index._data.columns + frame._columns), + how=how, + keys=self._positions_from_column_names( + subset, offset_by_index_columns=True + ), + thresh=thresh, + ), + self._column_names, + self._index.names, ) result._copy_type_metadata(frame) - if self._index is not None: - result._index.name = self._index.name - if isinstance(self._index, cudf.MultiIndex): - result._index.names = self._index.names return result def _drop_na_columns(self, how="any", subset=None, thresh=None): @@ -2262,55 +2305,45 @@ def to_arrow(self): ) def drop_duplicates( - self, - subset=None, - keep="first", - nulls_are_equal=True, - ignore_index=False, + self, keep="first", nulls_are_equal=True, ): """ - Drops rows in frame as per duplicate rows in `subset` columns from - self. + Drop duplicate rows in frame. - subset : list, optional - List of columns to consider when dropping rows. - keep : ["first", "last", False] first will keep first of duplicate, - last will keep last of the duplicate and False drop all - duplicate - nulls_are_equal: null elements are considered equal to other null - elements - ignore_index: bool, default False - If True, the resulting axis will be labeled 0, 1, …, n - 1. + keep : ["first", "last", False], default "first" + "first" will keep the first duplicate entry, "last" will keep the + last duplicate entry, and False will drop all duplicates. + nulls_are_equal: bool, default True + Null elements are considered equal to other null elements. 
""" - if subset is None: - subset = self._column_names - elif ( - not np.iterable(subset) - or isinstance(subset, str) - or isinstance(subset, tuple) - and subset in self._data.names - ): - subset = (subset,) - diff = set(subset) - set(self._data) - if len(diff) != 0: - raise KeyError(f"columns {diff} do not exist") - subset_cols = [name for name in self._column_names if name in subset] - if len(subset_cols) == 0: - return self.copy(deep=True) - result = self.__class__._from_data( - *libcudf.stream_compaction.drop_duplicates( - self, - keys=subset, + result = self.__class__._from_columns( + libcudf.stream_compaction.drop_duplicates( + list(self._columns), + keys=range(len(self._columns)), keep=keep, nulls_are_equal=nulls_are_equal, - ignore_index=ignore_index, - ) + ), + self._column_names, ) - + # TODO: _copy_type_metadata is a common pattern to apply after the + # roundtrip from libcudf. We should build this into a factory function + # to increase reusability. result._copy_type_metadata(self) return result + def _positions_from_column_names(self, column_names): + """Map each column name into their positions in the frame. + + The order of indices returned corresponds to the column order in this + Frame. + """ + return [ + i + for i, name in enumerate(self._column_names) + if name in set(column_names) + ] + def replace( self, to_replace=None, @@ -2589,7 +2622,10 @@ def _copy_type_metadata( self._index, cudf.core.index.CategoricalIndex ): self._index = cudf.Index( - cast(cudf.core.index.NumericIndex, self._index)._column + cast( + cudf.core.index.NumericIndex, self._index + )._column, + name=self._index.name, ) return self diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5ea9ac945dc..8f905ee6d49 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -86,6 +86,7 @@ def _lexsorted_equal_range( def _index_from_data(data: MutableMapping, name: Any = None): """Construct an index of the appropriate type from some data.""" + if len(data) == 0: raise ValueError("Cannot construct Index from any empty Table") if len(data) == 1: @@ -770,34 +771,6 @@ def deserialize(cls, header, frames): return super().deserialize(header, frames) - def drop_duplicates(self, keep="first"): - """ - Return Index with duplicate values removed - - Parameters - ---------- - keep : {‘first’, ‘last’, False}, default ‘first’ - * ‘first’ : Drop duplicates except for the - first occurrence. - * ‘last’ : Drop duplicates except for the - last occurrence. - * False : Drop all duplicates. 
- - Returns - ------- - Index - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) - >>> idx - StringIndex(['lama' 'cow' 'lama' 'beetle' 'lama' 'hippo'], dtype='object') - >>> idx.drop_duplicates() - StringIndex(['beetle' 'cow' 'hippo' 'lama'], dtype='object') - """ # noqa: E501 - return super().drop_duplicates(keep=keep) - def _binaryop( self, other: T, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index cf12907d96a..2044bad9675 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -8,17 +8,19 @@ from uuid import uuid4 import cupy as cp +import numpy as np import pandas as pd from nvtx import annotate import cudf +import cudf._lib as libcudf from cudf._typing import ColumnLike -from cudf.api.types import is_categorical_dtype, is_list_like +from cudf.api.types import is_categorical_dtype, is_integer_dtype, is_list_like from cudf.core.column import arange from cudf.core.frame import Frame from cudf.core.index import Index from cudf.core.multiindex import MultiIndex -from cudf.utils.utils import cached_property +from cudf.utils.utils import _gather_map_is_valid, cached_property def _indices_from_labels(obj, labels): @@ -435,6 +437,113 @@ def sort_index( out = out.reset_index(drop=True) return self._mimic_inplace(out, inplace=inplace) + def _gather( + self, gather_map, keep_index=True, nullify=False, check_bounds=True + ): + """Gather rows of frame specified by indices in `gather_map`. + + Skip bounds checking if check_bounds is False. + Set rows to null for all out of bound indices if nullify is `True`. + """ + gather_map = cudf.core.column.as_column(gather_map) + + # TODO: For performance, the check and conversion of gather map should + # be done by the caller. This check will be removed in future release. + if not is_integer_dtype(gather_map.dtype): + gather_map = gather_map.astype("int32") + + if not _gather_map_is_valid( + gather_map, len(self), check_bounds, nullify + ): + raise IndexError("Gather map index is out of bounds.") + + result = self.__class__._from_columns( + libcudf.copying.gather( + list(self._index._columns + self._columns) + if keep_index + else list(self._columns), + gather_map, + nullify=nullify, + ), + self._column_names, + self._index.names if keep_index else None, + ) + + result._copy_type_metadata(self, include_index=keep_index) + return result + + def _positions_from_column_names( + self, column_names, offset_by_index_columns=False + ): + """Map each column name into their positions in the frame. + + Return positions of the provided column names, offset by the number of + index columns `offset_by_index_columns` is True. The order of indices + returned corresponds to the column order in this Frame. + """ + num_index_columns = ( + len(self._index._data) if offset_by_index_columns else 0 + ) + return [ + i + num_index_columns + for i, name in enumerate(self._column_names) + if name in set(column_names) + ] + + def drop_duplicates( + self, + subset=None, + keep="first", + nulls_are_equal=True, + ignore_index=False, + ): + """ + Drop duplicate rows in frame. + + subset : list, optional + List of columns to consider when dropping rows. + keep : ["first", "last", False] + "first" will keep the first duplicate entry, "last" will keep the + last duplicate entry, and False will drop all duplicates. + nulls_are_equal: bool, default True + Null elements are considered equal to other null elements. 
+ ignore_index: bool, default False + If True, the resulting axis will be labeled 0, 1, ..., n - 1. + """ + if subset is None: + subset = self._column_names + elif ( + not np.iterable(subset) + or isinstance(subset, str) + or isinstance(subset, tuple) + and subset in self._data.names + ): + subset = (subset,) + diff = set(subset) - set(self._data) + if len(diff) != 0: + raise KeyError(f"columns {diff} do not exist") + subset_cols = [name for name in self._column_names if name in subset] + if len(subset_cols) == 0: + return self.copy(deep=True) + + keys = self._positions_from_column_names( + subset, offset_by_index_columns=not ignore_index + ) + result = self.__class__._from_columns( + libcudf.stream_compaction.drop_duplicates( + list(self._columns) + if ignore_index + else list(self._index._columns + self._columns), + keys=keys, + keep=keep, + nulls_are_equal=nulls_are_equal, + ), + self._column_names, + self._index.names if not ignore_index else None, + ) + result._copy_type_metadata(self) + return result + def sort_values( self, by, diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a1eda697683..e0c68e56f63 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -137,9 +137,9 @@ def __init__( else: level = cudf.DataFrame({column_name: levels[i]}) - source_data[column_name] = libcudf.copying.gather(level, col)[0][ - column_name - ] + source_data[column_name] = libcudf.copying.gather( + [level._data[column_name]], col + )[0] super().__init__(source_data) self._levels = levels @@ -1409,7 +1409,7 @@ def fillna(self, value): return super().fillna(value=value) def unique(self): - return self.drop_duplicates(ignore_index=True) + return self.drop_duplicates(keep="first") def _clean_nulls_from_index(self): """ diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index a9611a91554..cea384b9c11 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -12,6 +12,7 @@ import rmm import cudf +from cudf._lib.reduce import minmax from cudf.core import column from cudf.core.buffer import Buffer from cudf.utils.dtypes import to_cudf_compatible_scalar @@ -506,3 +507,20 @@ def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: if (indices == cp.arange(start, stop, step)).all(): return slice(start, stop, step) return indices + + +def _gather_map_is_valid( + gather_map: "cudf.core.column.ColumnBase", + nrows: int, + check_bounds: bool, + nullify: bool, +) -> bool: + """Returns true if gather map is valid. + + A gather map is valid if empty or all indices are within the range + ``[-nrows, nrows)``, except when ``nullify`` is specifed. + """ + if not check_bounds or nullify or len(gather_map) == 0: + return True + gm_min, gm_max = minmax(gather_map) + return gm_min >= -nrows and gm_max < nrows From 09a8a4773f74ef6241e9eac4e674181bc753de50 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 19 Nov 2021 16:57:25 -0600 Subject: [PATCH 017/202] Use stop instead of stop_. (#9735) Small fix to inconsistent variable names in tests, following up from #9571. 
Previous conversation: https://github.com/rapidsai/cudf/pull/9571#discussion_r750568195 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9735 --- cpp/tests/datetime/datetime_ops_test.cpp | 11 +++++------ cpp/tests/wrappers/timestamps_test.cu | 15 +++++++-------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 2097e09e674..4ac24317145 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -183,10 +183,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedDatetimeComponents) using namespace cudf::datetime; using namespace cuda::std::chrono; - auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto timestamps = - generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto timestamps = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); auto expected_years = fixed_width_column_wrapper{1890, 1906, 1922, 1938, 1954, 1970, 1985, 2001, 2017, 2033}; @@ -221,9 +220,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedNullableDatetimeComponen using namespace cuda::std::chrono; auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT auto timestamps = - generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); auto expected_years = fixed_width_column_wrapper{ {1890, 1906, 1922, 1938, 1954, 1970, 1985, 2001, 2017, 2033}, diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index b458f34cca8..097b786aefe 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -78,10 +78,9 @@ TYPED_TEST(ChronoColumnTest, ChronoDurationsMatchPrimitiveRepresentation) using namespace cudf::test; using namespace cuda::std::chrono; - auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto chrono_col = - generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto chrono_col = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); // round-trip through the host to copy `chrono_col` values // to a new fixed_width_column_wrapper `primitive_col` @@ -135,14 +134,14 @@ TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode) auto start_lhs = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT auto start_rhs = milliseconds(-2400000000000); // Tue, 12 Dec 1893 05:20:00 GMT - auto stop_lhs_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto stop_rhs_ = milliseconds(2600000000000); // Wed, 22 May 2052 14:13:20 GMT + auto stop_lhs = milliseconds(2500000000000); // Mon, 22 Mar 2049 
04:26:40 GMT + auto stop_rhs = milliseconds(2600000000000); // Wed, 22 May 2052 14:13:20 GMT auto chrono_lhs_col = - generate_timestamps(this->size(), time_point_ms(start_lhs), time_point_ms(stop_lhs_)); + generate_timestamps(this->size(), time_point_ms(start_lhs), time_point_ms(stop_lhs)); auto chrono_rhs_col = - generate_timestamps(this->size(), time_point_ms(start_rhs), time_point_ms(stop_rhs_)); + generate_timestamps(this->size(), time_point_ms(start_rhs), time_point_ms(stop_rhs)); rmm::device_uvector indices(this->size(), rmm::cuda_stream_default); thrust::sequence(rmm::exec_policy(), indices.begin(), indices.end()); From f0367c0e1ebec54c964a2114b248926b8f82ec04 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 19 Nov 2021 15:06:25 -0800 Subject: [PATCH 018/202] Use cuFile direct device reads/writes by default in cuIO (#9722) Making this change early in 22.02 to test through internal use + nightly builds before the release. - Modify the way cuFile integration is enabled to match the nvCOMP integration. - Change the default from OFF to GDS (GDS on, only for direct reads/writes, no compatibility mode). - cuFile JSON config file is now modified on first cuFile use (same time as the driver), instead of the first query that checks if GDS use is enabled. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9722 --- cpp/CMakeLists.txt | 1 + cpp/src/io/utilities/config_utils.cpp | 86 ++++++++++++++ cpp/src/io/utilities/config_utils.hpp | 37 +++--- cpp/src/io/utilities/datasource.cpp | 9 +- cpp/src/io/utilities/file_io_utilities.cpp | 110 +++++++++--------- cpp/src/io/utilities/file_io_utilities.hpp | 26 ----- .../cudf/source/basics/io-gds-integration.rst | 11 +- .../source/basics/io-nvcomp-integration.rst | 7 +- 8 files changed, 171 insertions(+), 116 deletions(-) create mode 100644 cpp/src/io/utilities/config_utils.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 966728d7647..7a556d2c0f6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -310,6 +310,7 @@ add_library( src/io/statistics/parquet_column_statistics.cu src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp + src/io/utilities/config_utils.cpp src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp new file mode 100644 index 00000000000..2c1dc1cc0aa --- /dev/null +++ b/cpp/src/io/utilities/config_utils.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "config_utils.hpp" + +#include + +#include +#include + +namespace cudf::io::detail { + +std::string getenv_or(std::string const& env_var_name, std::string_view default_val) +{ + auto const env_val = std::getenv(env_var_name.c_str()); + return std::string{(env_val == nullptr) ? default_val : env_val}; +} + +namespace cufile_integration { + +namespace { +/** + * @brief Defines which cuFile usage to enable. + */ +enum class usage_policy : uint8_t { OFF, GDS, ALWAYS }; + +/** + * @brief Get the current usage policy. + */ +usage_policy get_env_policy() +{ + static auto const env_val = getenv_or("LIBCUDF_CUFILE_POLICY", "GDS"); + if (env_val == "OFF") return usage_policy::OFF; + if (env_val == "GDS") return usage_policy::GDS; + if (env_val == "ALWAYS") return usage_policy::ALWAYS; + CUDF_FAIL("Invalid LIBCUDF_CUFILE_POLICY value: " + env_val); +} +} // namespace + +bool is_always_enabled() { return get_env_policy() == usage_policy::ALWAYS; } + +bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_policy::GDS; } + +} // namespace cufile_integration + +namespace nvcomp_integration { + +namespace { +/** + * @brief Defines which nvCOMP usage to enable. + */ +enum class usage_policy : uint8_t { OFF, STABLE, ALWAYS }; + +/** + * @brief Get the current usage policy. + */ +usage_policy get_env_policy() +{ + static auto const env_val = getenv_or("LIBCUDF_NVCOMP_POLICY", "STABLE"); + if (env_val == "OFF") return usage_policy::OFF; + if (env_val == "STABLE") return usage_policy::STABLE; + if (env_val == "ALWAYS") return usage_policy::ALWAYS; + CUDF_FAIL("Invalid LIBCUDF_NVCOMP_POLICY value: " + env_val); +} +} // namespace + +bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } + +bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_policy::STABLE; } + +} // namespace nvcomp_integration + +} // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/config_utils.hpp index a1d8e747e44..baa45fef08a 100644 --- a/cpp/src/io/utilities/config_utils.hpp +++ b/cpp/src/io/utilities/config_utils.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include namespace cudf::io::detail { @@ -24,44 +23,34 @@ namespace cudf::io::detail { * @brief Returns the value of the environment variable, or a default value if the variable is not * present. */ -inline std::string getenv_or(std::string const& env_var_name, std::string_view default_val) -{ - auto const env_val = std::getenv(env_var_name.c_str()); - return std::string{(env_val == nullptr) ? default_val : env_val}; -} +std::string getenv_or(std::string const& env_var_name, std::string_view default_val); -namespace nvcomp_integration { +namespace cufile_integration { -namespace { /** - * @brief Defines which nvCOMP usage to enable. + * @brief Returns true if cuFile and its compatibility mode are enabled. */ -enum class usage_policy : uint8_t { OFF, STABLE, ALWAYS }; +bool is_always_enabled(); /** - * @brief Get the current usage policy. + * @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled). 
*/ -inline usage_policy get_env_policy() -{ - static auto const env_val = getenv_or("LIBCUDF_NVCOMP_POLICY", "STABLE"); - if (env_val == "OFF") return usage_policy::OFF; - if (env_val == "ALWAYS") return usage_policy::ALWAYS; - return usage_policy::STABLE; -} -} // namespace +bool is_gds_enabled(); + +} // namespace cufile_integration + +namespace nvcomp_integration { /** * @brief Returns true if all nvCOMP uses are enabled. */ -inline bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } +bool is_all_enabled(); /** * @brief Returns true if stable nvCOMP use is enabled. */ -inline bool is_stable_enabled() -{ - return is_all_enabled() or get_env_policy() == usage_policy::STABLE; -} +bool is_stable_enabled(); } // namespace nvcomp_integration + } // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 7afffaede9e..3de6f35cb0d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -14,15 +14,16 @@ * limitations under the License. */ +#include "file_io_utilities.hpp" + #include +#include +#include #include #include #include -#include -#include "file_io_utilities.hpp" - namespace cudf { namespace io { namespace { @@ -239,7 +240,7 @@ std::unique_ptr datasource::create(const std::string& filepath, size_t size) { #ifdef CUFILE_FOUND - if (detail::cufile_config::instance()->is_required()) { + if (detail::cufile_integration::is_always_enabled()) { // avoid mmap as GDS is expected to be used for most reads return std::make_unique(filepath.c_str()); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 387452e171a..7a48b7d7301 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -51,45 +51,14 @@ file_wrapper::~file_wrapper() { close(fd); } #ifdef CUFILE_FOUND -cufile_config::cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} -{ - if (is_enabled()) { - // Modify the config file based on the policy - auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); - std::ifstream user_config_file(config_file_path); - // Modified config file is stored in a temporary directory - auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; - std::ofstream cudf_config_file(cudf_config_path); - - std::string line; - while (std::getline(user_config_file, line)) { - std::string const tag = "\"allow_compat_mode\""; - if (line.find(tag) != std::string::npos) { - // TODO: only replace the true/false value - // Enable compatiblity mode when cuDF does not fall back to host path - cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n"; - } else { - cudf_config_file << line << '\n'; - } - - // Point libcufile to the modified config file - CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, - "Failed to set the cuFile config file environment variable."); - } - } -} -cufile_config const* cufile_config::instance() -{ - static cufile_config _instance; - return &_instance; -} - /** * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. 
*/ class cufile_shim { private: cufile_shim(); + void modify_cufile_json() const; + void load_cufile_lib(); void* cf_lib = nullptr; decltype(cuFileDriverOpen)* driver_open = nullptr; @@ -116,25 +85,60 @@ class cufile_shim { decltype(cuFileWrite)* write = nullptr; }; +void cufile_shim::modify_cufile_json() const +{ + std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; + temp_directory tmp_config_dir{"cudf_cufile_config"}; + + // Modify the config file based on the policy + auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); + std::ifstream user_config_file(config_file_path); + // Modified config file is stored in a temporary directory + auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; + std::ofstream cudf_config_file(cudf_config_path); + + std::string line; + while (std::getline(user_config_file, line)) { + std::string const tag = "\"allow_compat_mode\""; + if (line.find(tag) != std::string::npos) { + // TODO: only replace the true/false value instead of replacing the whole line + // Enable compatibility mode when cuDF does not fall back to host path + cudf_config_file << tag << ": " + << (cufile_integration::is_always_enabled() ? "true" : "false") << ",\n"; + } else { + cudf_config_file << line << '\n'; + } + + // Point libcufile to the modified config file + CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, + "Failed to set the cuFile config file environment variable."); + } +} + +void cufile_shim::load_cufile_lib() +{ + cf_lib = dlopen("libcufile.so", RTLD_NOW); + driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); + CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile cuFileDriverOpen symbol"); + driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); + CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile cuFileDriverClose symbol"); + handle_register = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); + CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile cuFileHandleRegister symbol"); + handle_deregister = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); + CUDF_EXPECTS(handle_deregister != nullptr, "could not find cuFile cuFileHandleDeregister symbol"); + read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); + CUDF_EXPECTS(read != nullptr, "could not find cuFile cuFileRead symbol"); + write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); + CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite symbol"); +} + cufile_shim::cufile_shim() { try { - cf_lib = dlopen("libcufile.so", RTLD_NOW); - driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); - CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile cuFileDriverOpen symbol"); - driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); - CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile cuFileDriverClose symbol"); - handle_register = - reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); - CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile cuFileHandleRegister symbol"); - handle_deregister = - reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); - CUDF_EXPECTS(handle_deregister != nullptr, - "could not find cuFile cuFileHandleDeregister symbol"); - read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); - CUDF_EXPECTS(read != nullptr, "could not find cuFile cuFileRead symbol"); - write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); - CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite 
symbol"); + modify_cufile_json(); + load_cufile_lib(); CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); } catch (cudf::logic_error const& err) { @@ -285,11 +289,11 @@ std::future cufile_output_impl::write_async(void const* data, size_t offse std::unique_ptr make_cufile_input(std::string const& filepath) { #ifdef CUFILE_FOUND - if (cufile_config::instance()->is_enabled()) { + if (cufile_integration::is_gds_enabled()) { try { return std::make_unique(filepath); } catch (...) { - if (cufile_config::instance()->is_required()) throw; + if (cufile_integration::is_always_enabled()) throw; } } #endif @@ -299,11 +303,11 @@ std::unique_ptr make_cufile_input(std::string const& filepath std::unique_ptr make_cufile_output(std::string const& filepath) { #ifdef CUFILE_FOUND - if (cufile_config::instance()->is_enabled()) { + if (cufile_integration::is_gds_enabled()) { try { return std::make_unique(filepath); } catch (...) { - if (cufile_config::instance()->is_required()) throw; + if (cufile_integration::is_always_enabled()) throw; } } #endif diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 0119f9b7abd..ede0eb6f925 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -162,32 +162,6 @@ class cufile_output : public cufile_io_base { class cufile_shim; -/** - * @brief Class that manages cuFile configuration. - */ -class cufile_config { - std::string const default_policy = "OFF"; - std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; - - std::string const policy = default_policy; - temp_directory tmp_config_dir{"cudf_cufile_config"}; - - cufile_config(); - - public: - /** - * @brief Returns true when cuFile use is enabled. - */ - bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; } - - /** - * @brief Returns true when cuDF should not fall back to host IO. - */ - bool is_required() const { return policy == "ALWAYS"; } - - static cufile_config const* instance(); -}; - /** * @brief Class that provides RAII for cuFile file registration. */ diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst index 29cbc2024fc..20f3ec87ccb 100644 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ b/docs/cudf/source/basics/io-gds-integration.rst @@ -5,17 +5,18 @@ Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations. GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. The SDK is available for download `here `_. +GDS is also included in CUDA Toolkit 11.4 and higher. -Use of GPUDirect Storage in cuDF is disabled by default, and can be enabled through environment variable ``LIBCUDF_CUFILE_POLICY``. +Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. This variable also controls the GDS compatibility mode. -There are three special values for the environment variable: +There are three valid values for the environment variable: - "GDS": Enable GDS use; GDS compatibility mode is *off*. - "ALWAYS": Enable GDS use; GDS compatibility mode is *on*. -- "OFF": Compretely disable GDS use. +- "OFF": Completely disable GDS use. 
-Any other value (or no value set) will keep the GDS disabled for use in cuDF and IO will be done using cuDF's CPU bounce buffers. +If no value is set, behavior will be the same as the "GDS" option. This environment variable also affects how cuDF treats GDS errors. When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. @@ -30,5 +31,3 @@ Operations that support the use of GPUDirect Storage: - `to_csv` - `to_parquet` - `to_orc` - -NOTE: current GDS integration is not fully optimized and enabling GDS will not lead to performance improvements in all cases. diff --git a/docs/cudf/source/basics/io-nvcomp-integration.rst b/docs/cudf/source/basics/io-nvcomp-integration.rst index af89ab5285f..521833e2afd 100644 --- a/docs/cudf/source/basics/io-nvcomp-integration.rst +++ b/docs/cudf/source/basics/io-nvcomp-integration.rst @@ -3,15 +3,16 @@ nvCOMP Integration Some types of compression/decompression can be performed using either `nvCOMP library `_ or the internal implementation. -Which implementation is used by default depends on the data format and the compression type. Behavior can be influenced through environment variable ``LIBCUDF_NVCOMP_POLICY``. +Which implementation is used by default depends on the data format and the compression type. +Behavior can be influenced through environment variable ``LIBCUDF_NVCOMP_POLICY``. -There are three special values for the environment variable: +There are three valid values for the environment variable: - "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use. - "ALWAYS": Enable all available uses of nvCOMP, including new, experimental combinations. - "OFF": Disable nvCOMP use whenever possible and use the internal implementations instead. -Any other value (or no value set) will result in the same behavior as the "STABLE" option. +If no value is set, behavior will be the same as the "STABLE" option. .. table:: Current policy for nvCOMP use for different types From 65af9a301acd19784fe7d2d03702be827ce97661 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Nov 2021 16:02:13 -0800 Subject: [PATCH 019/202] Improve cmake format script (#9723) This PR ports some improvements from rapidsai/rmm#913. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/9723 --- .pre-commit-config.yaml | 4 ++-- cpp/scripts/run-cmake-format.sh | 32 +++++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e993f548e1d..1e1ad94ab0b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -72,7 +72,7 @@ repos: args: ['-fallback-style=none'] - id: cmake-format name: cmake-format - entry: bash cpp/scripts/run-cmake-format.sh cmake-format + entry: ./cpp/scripts/run-cmake-format.sh cmake-format language: python types: [cmake] # Note that pre-commit autoupdate does not update the versions @@ -81,7 +81,7 @@ repos: - cmake-format==0.6.11 - id: cmake-lint name: cmake-lint - entry: bash cpp/scripts/run-cmake-format.sh cmake-lint + entry: ./cpp/scripts/run-cmake-format.sh cmake-lint language: python types: [cmake] # Note that pre-commit autoupdate does not update the versions diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh index 76de008b14a..9c981c6cdaa 100755 --- a/cpp/scripts/run-cmake-format.sh +++ b/cpp/scripts/run-cmake-format.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script is a pre-commit hook that wraps cmakelang's cmake linters. The +# This script is a wrapper for cmakelang that may be used with pre-commit. The # wrapping is necessary because RAPIDS libraries split configuration for # cmakelang linters between a local config file and a second config file that's # shared across all of RAPIDS via rapids-cmake. In order to keep it up to date @@ -16,19 +16,33 @@ # config file at a nonstandard location, they may do so by setting the # environment variable RAPIDS_CMAKE_FORMAT_FILE. # -# While this script can be invoked directly (but only from the repo root since -# all paths are relative to that), it is advisable to instead use the -# pre-commit hooks via -# `pre-commit run (cmake-format)|(cmake-format)`. +# This script can be invoked directly anywhere within the project repository. +# Alternatively, it may be invoked as a pre-commit hook via +# `pre-commit run (cmake-format)|(cmake-lint)`. # # Usage: # bash run-cmake-format.sh {cmake-format,cmake-lint} infile [infile ...] -# Note that pre-commit always runs from the root of the repository, so relative -# paths are automatically relative to the repo root. +status=0 +if [ -z ${CUDF_ROOT:+PLACEHOLDER} ]; then + CUDF_BUILD_DIR=$(git rev-parse --show-toplevel 2>&1)/cpp/build + status=$? +else + CUDF_BUILD_DIR=${CUDF_ROOT} +fi + +if ! [ ${status} -eq 0 ]; then + if [[ ${CUDF_BUILD_DIR} == *"not a git repository"* ]]; then + echo "This script must be run inside the cudf repository, or the CUDF_ROOT environment variable must be set." 
+ else + echo "Script failed with unknown error attempting to determine project root:" + echo ${CUDF_BUILD_DIR} + fi + exit 1 +fi + DEFAULT_FORMAT_FILE_LOCATIONS=( - "cpp/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" - "${CUDF_ROOT:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" + "${CUDF_BUILD_DIR:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" "cpp/libcudf_kafka/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" ) From 43a13c6aac76a2a5a42674b4e3e05dbb65ddb741 Mon Sep 17 00:00:00 2001 From: Peixin Date: Mon, 22 Nov 2021 13:43:19 +0800 Subject: [PATCH 020/202] Skip cufile tests in JNI build script (#9744) Signed-off-by: Peixin Li related to #9722 skip cufile test in JNI build while we have a separate pipeline for GDS testing Authors: - Peixin (https://github.com/pxLi) Approvers: - Tim Liu (https://github.com/NvTimLiu) - Gary Shen (https://github.com/GaryShen2008) URL: https://github.com/rapidsai/cudf/pull/9744 --- java/ci/build-in-docker.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index e596cdae5b3..df4ca853398 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -16,7 +16,7 @@ # limitations under the License. # -set -e +set -ex gcc --version PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} @@ -58,7 +58,7 @@ make -j$PARALLEL_LEVEL make install DESTDIR=$INSTALL_PREFIX ###### Build cudf jar ###### -BUILD_ARG="-Dmaven.repo.local=\"$WORKSPACE/.m2\" -DskipTests=$SKIP_JAVA_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL -DUSE_GDS=$ENABLE_GDS" +BUILD_ARG="-Dmaven.repo.local=\"$WORKSPACE/.m2\" -DskipTests=$SKIP_JAVA_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL -DUSE_GDS=$ENABLE_GDS -Dtest=*,!CuFileTest" if [ "$SIGN_FILE" == true ]; then # Build javadoc and sources only when SIGN_FILE is true BUILD_ARG="$BUILD_ARG -Prelease" From 7fa15db306631c026642942993283bd93da1c7c2 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 22 Nov 2021 09:33:47 -0500 Subject: [PATCH 021/202] Fix doxygen for enum types in libcudf (#9724) Fix some doxygen formatting errors around enum types found when looking at various pages in the published docs: https://docs.rapids.ai/api/libcudf/stable/namespacecudf.html Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9724 --- .../cudf/ast/detail/expression_parser.hpp | 6 +-- cpp/include/cudf/ast/expressions.hpp | 6 +-- cpp/include/cudf/copying.hpp | 8 ++-- cpp/include/cudf/detail/gather.cuh | 6 +-- cpp/include/cudf/detail/structs/utilities.hpp | 4 +- cpp/include/cudf/io/types.hpp | 6 +-- .../cudf/strings/char_types/char_types.hpp | 20 ++++---- cpp/include/cudf/strings/regex/flags.hpp | 6 +-- cpp/include/cudf/strings/strip.hpp | 6 +-- cpp/include/cudf/strings/translate.hpp | 5 +- cpp/include/cudf/unary.hpp | 46 +++++++++---------- 11 files changed, 61 insertions(+), 58 deletions(-) diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index dc800bde527..4f73cb1ef6e 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -37,9 +37,9 @@ namespace detail { * linearization process but cannot be explicitly created by the user. 
*/ enum class device_data_reference_type { - COLUMN, // A value in a table column - LITERAL, // A literal value - INTERMEDIATE // An internal temporary value + COLUMN, ///< A value in a table column + LITERAL, ///< A literal value + INTERMEDIATE ///< An internal temporary value }; /** diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 7ae40a7d65f..20aaa42fb68 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -122,9 +122,9 @@ enum class ast_operator { * This determines which table to use in cases with two tables (e.g. joins). */ enum class table_reference { - LEFT, // Column index in the left table - RIGHT, // Column index in the right table - OUTPUT // Column index in the output table + LEFT, ///< Column index in the left table + RIGHT, ///< Column index in the right table + OUTPUT ///< Column index in the output table }; /** diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index ba5043fb261..81dddbd284a 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -41,8 +41,8 @@ namespace cudf { */ enum class out_of_bounds_policy : bool { - NULLIFY, /// Output values corresponding to out-of-bounds indices are null - DONT_CHECK /// No bounds checking is performed, better performance + NULLIFY, ///< Output values corresponding to out-of-bounds indices are null + DONT_CHECK ///< No bounds checking is performed, better performance }; /** @@ -901,8 +901,8 @@ std::unique_ptr get_element( * @brief Indicates whether a row can be sampled more than once. */ enum class sample_with_replacement : bool { - FALSE, // A row can be sampled only once - TRUE // A row can be sampled more than once + FALSE, ///< A row can be sampled only once + TRUE ///< A row can be sampled more than once }; /** diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 594191d275d..08dbdb6f1a0 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -66,9 +66,9 @@ struct bounds_checker { * @brief The operation to perform when a gather map index is out of bounds */ enum class gather_bitmask_op { - DONT_CHECK, // Don't check for out of bounds indices - PASSTHROUGH, // Preserve mask at rows with out of bounds indices - NULLIFY, // Nullify rows with out of bounds indices + DONT_CHECK, ///< Don't check for out of bounds indices + PASSTHROUGH, ///< Preserve mask at rows with out of bounds indices + NULLIFY, ///< Nullify rows with out of bounds indices }; template diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index aece79107c6..6f32e3190bf 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -28,8 +28,8 @@ namespace structs { namespace detail { enum class column_nullability { - MATCH_INCOMING, // generate a null column if the incoming column has nulls - FORCE // always generate a null column + MATCH_INCOMING, ///< generate a null column if the incoming column has nulls + FORCE ///< always generate a null column }; /** diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index ac965e2d416..cf6be8a20af 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -87,9 +87,9 @@ enum class quote_style { * @brief Column statistics granularity type for parquet/orc writers */ enum statistics_freq { - STATISTICS_NONE = 0, //!< No column statistics - STATISTICS_ROWGROUP = 1, //!< 
Per-Rowgroup column statistics - STATISTICS_PAGE = 2, //!< Per-page column statistics + STATISTICS_NONE = 0, ///< No column statistics + STATISTICS_ROWGROUP = 1, ///< Per-Rowgroup column statistics + STATISTICS_PAGE = 2, ///< Per-page column statistics }; /** diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 2af79de0716..04d65065bd3 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -37,16 +37,16 @@ namespace strings { * does not match to any explicitly named enumerator. */ enum string_character_types : uint32_t { - DECIMAL = 1 << 0, /// all decimal characters - NUMERIC = 1 << 1, /// all numeric characters - DIGIT = 1 << 2, /// all digit characters - ALPHA = 1 << 3, /// all alphabetic characters - SPACE = 1 << 4, /// all space characters - UPPER = 1 << 5, /// all upper case characters - LOWER = 1 << 6, /// all lower case characters - ALPHANUM = DECIMAL | NUMERIC | DIGIT | ALPHA, /// all alphanumeric characters - CASE_TYPES = UPPER | LOWER, /// all case-able characters - ALL_TYPES = ALPHANUM | CASE_TYPES | SPACE /// all character types + DECIMAL = 1 << 0, ///< all decimal characters + NUMERIC = 1 << 1, ///< all numeric characters + DIGIT = 1 << 2, ///< all digit characters + ALPHA = 1 << 3, ///< all alphabetic characters + SPACE = 1 << 4, ///< all space characters + UPPER = 1 << 5, ///< all upper case characters + LOWER = 1 << 6, ///< all lower case characters + ALPHANUM = DECIMAL | NUMERIC | DIGIT | ALPHA, ///< all alphanumeric characters + CASE_TYPES = UPPER | LOWER, ///< all case-able characters + ALL_TYPES = ALPHANUM | CASE_TYPES | SPACE ///< all character types }; /** diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f6aee6d22cc..637b3b0851b 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -33,9 +33,9 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, /// default - MULTILINE = 8, /// the '^' and '$' honor new-line characters - DOTALL = 16 /// the '.' matching includes new-line characters + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16 ///< the '.' matching includes new-line characters }; /** diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index 72863bdf23b..fe9cd41e780 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -31,9 +31,9 @@ namespace strings { * @brief Direction identifier for strip() function. 
*/ enum class strip_type { - LEFT, //<< strip characters from the beginning of the string - RIGHT, //<< strip characters from the end of the string - BOTH //<< strip characters from the beginning and end of the string + LEFT, ///< strip characters from the beginning of the string + RIGHT, ///< strip characters from the end of the string + BOTH ///< strip characters from the beginning and end of the string }; /** diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp index e014f88c451..0cbf6b22029 100644 --- a/cpp/include/cudf/strings/translate.hpp +++ b/cpp/include/cudf/strings/translate.hpp @@ -60,7 +60,10 @@ std::unique_ptr translate( /** * @brief Removes or keeps the specified character ranges in cudf::strings::filter_characters */ -enum class filter_type : bool { KEEP, REMOVE }; +enum class filter_type : bool { + KEEP, ///< All characters but those specified are removed + REMOVE ///< Only the specified characters are removed +}; /** * @brief Removes ranges of characters from each string in a strings column. diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 254a7988e2e..36f08b7f23e 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,28 +28,28 @@ namespace cudf { */ enum class unary_operator : int32_t { - SIN, // < Trigonometric sine - COS, // < Trigonometric cosine - TAN, // < Trigonometric tangent - ARCSIN, // < Trigonometric sine inverse - ARCCOS, // < Trigonometric cosine inverse - ARCTAN, // < Trigonometric tangent inverse - SINH, // < Hyperbolic sine - COSH, // < Hyperbolic cosine - TANH, // < Hyperbolic tangent - ARCSINH, // < Hyperbolic sine inverse - ARCCOSH, // < Hyperbolic cosine inverse - ARCTANH, // < Hyperbolic tangent inverse - EXP, // < Exponential (base e, Euler number) - LOG, // < Natural Logarithm (base e) - SQRT, // < Square-root (x^0.5) - CBRT, // < Cube-root (x^(1.0/3)) - CEIL, // < Smallest integer value not less than arg - FLOOR, // < largest integer value not greater than arg - ABS, // < Absolute value - RINT, // < Rounds the floating-point argument arg to an integer value - BIT_INVERT, // < Bitwise Not (~) - NOT, // < Logical Not (!) + SIN, ///< Trigonometric sine + COS, ///< Trigonometric cosine + TAN, ///< Trigonometric tangent + ARCSIN, ///< Trigonometric sine inverse + ARCCOS, ///< Trigonometric cosine inverse + ARCTAN, ///< Trigonometric tangent inverse + SINH, ///< Hyperbolic sine + COSH, ///< Hyperbolic cosine + TANH, ///< Hyperbolic tangent + ARCSINH, ///< Hyperbolic sine inverse + ARCCOSH, ///< Hyperbolic cosine inverse + ARCTANH, ///< Hyperbolic tangent inverse + EXP, ///< Exponential (base e, Euler number) + LOG, ///< Natural Logarithm (base e) + SQRT, ///< Square-root (x^0.5) + CBRT, ///< Cube-root (x^(1.0/3)) + CEIL, ///< Smallest integer value not less than arg + FLOOR, ///< largest integer value not greater than arg + ABS, ///< Absolute value + RINT, ///< Rounds the floating-point argument arg to an integer value + BIT_INVERT, ///< Bitwise Not (~) + NOT, ///< Logical Not (!) 
}; /** From cac53c5b7f4845faea935b29a6efb323eff56a19 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 22 Nov 2021 10:42:59 -0800 Subject: [PATCH 022/202] Enable string to decimal 128 cast (#9742) A short PR to enable String to Decimal 128 cast Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9742 --- java/src/main/native/src/ColumnViewJni.cpp | 1 + .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index bce330ea4a3..4efac307627 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -936,6 +936,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas break; case cudf::type_id::DECIMAL32: case cudf::type_id::DECIMAL64: + case cudf::type_id::DECIMAL128: result = cudf::strings::to_fixed_point(*column, n_data_type); break; default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index f332661dc19..a582541a0d4 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3370,6 +3370,22 @@ void testFixedWidthCast() { } } + @Test + void testCastStringToBigDecimal() { + String[] bigValues = {"923121331938210123.321", + "9223372036854775808.191", + "9328323982309091029831.002" + }; + + try (ColumnVector cv = ColumnVector.fromStrings(bigValues); + ColumnVector values = cv.castTo(DType.create(DType.DTypeEnum.DECIMAL128, -3)); + ColumnVector expected = ColumnVector.fromDecimals(new BigDecimal("923121331938210123.321"), + new BigDecimal("9223372036854775808.191"), + new BigDecimal("9328323982309091029831.002"))) { + assertColumnsAreEqual(expected, values); + } + } + @Test void testCastByteToString() { From ebeb2023ce81f254aaa638c0cd308da98b15418d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 22 Nov 2021 14:23:13 -0500 Subject: [PATCH 023/202] Fix out-of-bounds memory write in decimal128-to-string conversion (#9740) This fixes an error found in a memcheck test referenced here: https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=cuda115,LINUX_VER=centos7,PYTHON=3.8/5082/ This also disables the `FixedPointStringConversionOperator` which fails on a Debug build and may be a bug in `std::string`. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9740 --- cpp/src/strings/convert/utilities.cuh | 7 ++++--- cpp/tests/strings/fixed_point_tests.cpp | 4 ++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh index 234ecf48f2e..d9ca8159706 100644 --- a/cpp/src/strings/convert/utilities.cuh +++ b/cpp/src/strings/convert/utilities.cuh @@ -67,8 +67,9 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) bool const is_negative = cuda::std::is_signed() ? 
(value < 0) : false; constexpr IntegerType base = 10; - constexpr int MAX_DIGITS = 20; // largest 64-bit integer is 20 digits - char digits[MAX_DIGITS]; // place-holder for digit chars + // largest 64-bit integer is 20 digits; largest 128-bit integer is 39 digits + constexpr int MAX_DIGITS = cuda::std::numeric_limits::digits10 + 1; + char digits[MAX_DIGITS]; // place-holder for digit chars int digits_idx = 0; while (value != 0) { assert(digits_idx < MAX_DIGITS); @@ -107,7 +108,7 @@ constexpr size_type count_digits(IntegerType value) auto const digits = [value] { // largest 8-byte unsigned value is 18446744073709551615 (20 digits) // largest 16-byte unsigned value is 340282366920938463463374607431768211455 (39 digits) - auto constexpr max_digits = std::is_same_v ? 39 : 20; + auto constexpr max_digits = cuda::std::numeric_limits::digits10 + 1; size_type digits = 1; __int128_t pow10 = 10; diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 7c188d39f6f..ce4280e0733 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -303,7 +303,11 @@ TEST_F(StringsConvertTest, IsFixedPoint) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled); } +#ifdef NDEBUG TEST_F(StringsConvertTest, FixedPointStringConversionOperator) +#else +TEST_F(StringsConvertTest, DISABLED_FixedPointStringConversionOperator) +#endif { auto const max = cuda::std::numeric_limits<__int128_t>::max(); From d1811b5baf1d83f8d376a4f6e7fd84020a24506b Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 24 Nov 2021 01:22:13 +0530 Subject: [PATCH 024/202] update cuda version in local build (#9736) update cuda, ubuntu, python versions in local build using gpuci docker image. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/9736 --- ci/local/README.md | 6 +++--- ci/local/build.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/local/README.md b/ci/local/README.md index 96002802263..7754bcaf647 100644 --- a/ci/local/README.md +++ b/ci/local/README.md @@ -18,12 +18,12 @@ Build and test your local repository using a base gpuCI Docker image where: -H Show this help text -r Path to repository (defaults to working directory) - -i Use Docker image (default is gpuci/rapidsai:${NIGHTLY_VERSION}-cuda10.1-devel-ubuntu16.04-py3.7) + -i Use Docker image (default is gpuci/rapidsai:${NIGHTLY_VERSION}-cuda11.5-devel-ubuntu20.04-py3.8) -s Skip building and testing and start an interactive shell in a container of the Docker image ``` Example Usage: -`bash build.sh -r ~/rapids/cudf -i gpuci/rapidsai:0.16-cuda10.2-devel-ubuntu16.04-py3.7` +`bash build.sh -r ~/rapids/cudf -i gpuci/rapidsai:22.02-cuda11.5-devel-ubuntu20.04-py3.8` For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai/tags) page. @@ -42,7 +42,7 @@ There are some caveats to be aware of when using this script, especially if you ### Docker Image Build Repository -The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cudf/build_rapidsai_cuda10.1-ubuntu16.04-py3.7/`. Feel free to remove this directory after the script is finished. 
+The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cudf/build_rapidsai_cuda11.5-ubuntu20.04-py3.8/`. Feel free to remove this directory after the script is finished. *Note*: The script *will not* override your local build repository. Your local environment stays in tact. diff --git a/ci/local/build.sh b/ci/local/build.sh index 1bfb8b63fef..345db967264 100755 --- a/ci/local/build.sh +++ b/ci/local/build.sh @@ -3,7 +3,7 @@ GIT_DESCRIBE_TAG=`git describe --tags` MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` -DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda11.0-devel-ubuntu18.04-py3.7" +DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda11.5-devel-ubuntu20.04-py3.8" REPO_PATH=${PWD} RAPIDS_DIR_IN_CONTAINER="/rapids" CPP_BUILD_DIR="cpp/build" From 0fa0cc48a6b3b93e79f918d419a012b75765561c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 30 Nov 2021 07:50:12 -0700 Subject: [PATCH 025/202] Support `min` and `max` in inclusive scan for structs (#9725) This PR continues to address https://github.com/rapidsai/cudf/issues/8974, adding support for structs in `min` and `max` inclusive scan. Exclusive scan support is not needed in the near future. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - David Wendt (https://github.com/davidwendt) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/9725 --- cpp/include/cudf/detail/scan.hpp | 71 ++++---- cpp/src/groupby/sort/group_scan_util.cuh | 6 + cpp/src/reductions/scan/scan_inclusive.cu | 87 +++++++++- cpp/tests/reductions/reduction_tests.cpp | 2 +- cpp/tests/reductions/scan_tests.cpp | 196 ++++++++++++++++++++++ 5 files changed, 325 insertions(+), 37 deletions(-) diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index 113c15f19a1..8e3db1c7b10 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -26,22 +26,25 @@ namespace detail { /** * @brief Computes the exclusive scan of a column. * - * The null values are skipped for the operation, and if an input element - * at `i` is null, then the output element at `i` will also be null. + * The null values are skipped for the operation, and if an input element at `i` is null, then the + * output element at `i` will also be null. * - * The identity value for the column type as per the aggregation type - * is used for the value of the first element in the output column. + * The identity value for the column type as per the aggregation type is used for the value of the + * first element in the output column. * - * @throws cudf::logic_error if column data_type is not an arithmetic type. + * Struct columns are allowed with aggregation types Min and Max. * - * @param input The input column view for the scan - * @param agg unique_ptr to aggregation operator applied by the scan - * @param null_handling Exclude null values when computing the result if - * null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE. - * Any operation with a null results in a null. + * @throws cudf::logic_error if column data_type is not an arithmetic type or struct type but the + * `agg` is not Min or Max. + * + * @param input The input column view for the scan. + * @param agg unique_ptr to aggregation operator applied by the scan. + * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE. 
+ * Include nulls if null_policy::INCLUDE. Any operation with a null results in + * a null. * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned scalar's device memory - * @returns Column with scan results + * @param mr Device memory resource used to allocate the returned scalar's device memory. + * @returns Column with scan results. */ std::unique_ptr scan_exclusive(column_view const& input, std::unique_ptr const& agg, @@ -52,22 +55,22 @@ std::unique_ptr scan_exclusive(column_view const& input, /** * @brief Computes the inclusive scan of a column. * - * The null values are skipped for the operation, and if an input element - * at `i` is null, then the output element at `i` will also be null. + * The null values are skipped for the operation, and if an input element at `i` is null, then the + * output element at `i` will also be null. * - * String columns are allowed with aggregation types Min and Max. + * String and struct columns are allowed with aggregation types Min and Max. * - * @throws cudf::logic_error if column data_type is not an arithmetic type - * or string type but the `agg` is not Min or Max + * @throws cudf::logic_error if column data_type is not an arithmetic type or string/struct types + * but the `agg` is not Min or Max. * - * @param input The input column view for the scan - * @param agg unique_ptr to aggregation operator applied by the scan - * @param null_handling Exclude null values when computing the result if - * null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE. - * Any operation with a null results in a null. + * @param input The input column view for the scan. + * @param agg unique_ptr to aggregation operator applied by the scan. + * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE. + * Include nulls if null_policy::INCLUDE. Any operation with a null results in + * a null. * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned scalar's device memory - * @returns Column with scan results + * @param mr Device memory resource used to allocate the returned scalar's device memory. + * @returns Column with scan results. */ std::unique_ptr scan_inclusive(column_view const& input, std::unique_ptr const& agg, @@ -76,24 +79,24 @@ std::unique_ptr scan_inclusive(column_view const& input, rmm::mr::device_memory_resource* mr); /** - * @brief Generate row ranks for a column + * @brief Generate row ranks for a column. * - * @param order_by Input column to generate ranks for - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return rank values + * @param order_by Input column to generate ranks for. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return rank values. */ std::unique_ptr inclusive_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** - * @brief Generate row dense ranks for a column + * @brief Generate row dense ranks for a column. 
* - * @param order_by Input column to generate ranks for - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return rank values + * @param order_by Input column to generate ranks for. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return rank values. */ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index b565e8dc6d8..ae3e3232e06 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -239,7 +239,13 @@ struct group_scan_functor()}, gather_map.size(), gather_map.data()); + // // Gather the children elements of the prefix min/max struct elements first. + // + // Typically, we should use `get_sliced_child` for each child column to properly handle the + // input if it is a sliced view. However, since the input to this function is just generated + // from groupby internal APIs which is never a sliced view, we just use `child_begin` and + // `child_end` iterators for simplicity. auto scanned_children = cudf::detail::gather( table_view(std::vector{values.child_begin(), values.child_end()}), diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index 02ecd6df4d9..70f5ca90539 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -14,13 +14,17 @@ * limitations under the License. */ -#include "scan.cuh" +#include +#include #include +#include #include #include #include #include +#include +#include #include #include @@ -150,6 +154,72 @@ struct scan_functor { } }; +template +struct scan_functor { + static std::unique_ptr invoke(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // Op is used only to determined if we want to find the min or max element. + auto constexpr is_min_op = std::is_same_v; + + // Build indices of the scan operation results (ARGMIN/ARGMAX). + // When finding ARGMIN, we need to consider nulls as larger than non-null elements, and the + // opposite for ARGMAX. + auto gather_map = rmm::device_uvector(input.size(), stream); + auto const do_scan = [&](auto const& binop) { + thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + gather_map.begin(), + binop); + }; + + auto constexpr null_precedence = is_min_op ? cudf::null_order::AFTER : cudf::null_order::BEFORE; + auto const flattened_input = cudf::structs::detail::flatten_nested_columns( + table_view{{input}}, {}, std::vector{null_precedence}); + auto const d_flattened_input_ptr = table_device_view::create(flattened_input, stream); + auto const flattened_null_precedences = + is_min_op ? 
cudf::detail::make_device_uvector_async(flattened_input.null_orders(), stream) + : rmm::device_uvector(0, stream); + + if (input.has_nulls()) { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + do_scan(binop); + } else { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + do_scan(binop); + } + + // Gather the children columns of the input column. Must use `get_sliced_child` to properly + // handle input in case it is a sliced view. + auto const input_children = [&] { + auto const it = cudf::detail::make_counting_transform_iterator( + 0, [structs_view = structs_column_view{input}, stream](auto const child_idx) { + return structs_view.get_sliced_child(child_idx); + }); + return std::vector(it, it + input.num_children()); + }(); + + // Gather the children elements of the prefix min/max struct elements for the output. + auto scanned_children = cudf::detail::gather(table_view{input_children}, + gather_map, + out_of_bounds_policy::DONT_CHECK, + negative_index_policy::NOT_ALLOWED, + stream, + mr) + ->release(); + + // Don't need to set a null mask because that will be handled at the caller. + return make_structs_column(input.size(), + std::move(scanned_children), + UNKNOWN_NULL_COUNT, + rmm::device_buffer{0, stream, mr}); + } +}; + /** * @brief Dispatcher for running a Scan operation on an input column * @@ -161,7 +231,11 @@ struct scan_dispatcher { template static constexpr bool is_supported() { - return std::is_invocable_v && !cudf::is_dictionary(); + if constexpr (std::is_same_v) { + return std::is_same_v || std::is_same_v; + } else { + return std::is_invocable_v && !cudf::is_dictionary(); + } } public: @@ -209,6 +283,15 @@ std::unique_ptr scan_inclusive( output->set_null_mask(mask_scan(input, scan_type::INCLUSIVE, stream, mr), UNKNOWN_NULL_COUNT); } + // If the input is a structs column, we also need to push down nulls from the parent output column + // into the children columns. 
+ if (input.type().id() == type_id::STRUCT && output->has_nulls()) { + for (size_type idx = 0; idx < output->num_children(); ++idx) { + structs::detail::superimpose_parent_nulls( + output->view().null_mask(), output->null_count(), output->child(idx), stream, mr); + } + } + return output; } } // namespace detail diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 2c9279260e7..d8ee8f9d08d 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -28,7 +29,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index d1e983460d5..0892436eb47 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -397,3 +398,198 @@ TYPED_TEST(ScanDurationTest, Sum) EXPECT_THROW(cudf::scan(col, cudf::make_sum_aggregation(), cudf::scan_type::EXCLUSIVE), cudf::logic_error); } + +struct StructScanTest : public cudf::test::BaseFixture { +}; + +TEST_F(StructScanTest, StructScanMinMaxNoNull) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + + auto const input = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = INTS_CW{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return STRUCTS_CW{{child1, child2}}; + }(); + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "año", "año", "aaa", "aaa", "aaa", "aaa", "$1", "$1", "$1"}; + auto child2 = INTS_CW{1, 1, 1, 4, 4, 4, 4, 8, 8, 8}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1"}; + auto child2 = INTS_CW{1, 2, 3, 3, 3, 3, 3, 3, 3, 3}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(StructScanTest, StructScanMinMaxSlicedInput) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + constexpr int32_t dont_care{1}; + + auto const input_original = [] { + auto child1 = STRINGS_CW{"$dont_care", + "$dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "₹dont_care"}; + auto child2 = INTS_CW{dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return STRUCTS_CW{{child1, child2}}; + }(); + + auto const input = cudf::slice(input_original, {2, 12})[0]; + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "año", "año", "aaa", "aaa", "aaa", "aaa", "$1", "$1", "$1"}; + auto child2 = INTS_CW{1, 1, 1, 4, 4, 4, 4, 8, 8, 8}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", 
"₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1"}; + auto child2 = INTS_CW{1, 2, 3, 3, 3, 3, 3, 3, 3, 3}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(StructScanTest, StructScanMinMaxWithNulls) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + using cudf::test::iterators::nulls_at; + + auto const input = [] { + auto child1 = STRINGS_CW{{"año", + "bit", + "₹1" /*NULL*/, + "aaa" /*NULL*/, + "zit", + "bat", + "aab", + "$1" /*NULL*/, + "€1" /*NULL*/, + "wut"}, + nulls_at({2, 7})}; + auto child2 = INTS_CW{{1, 2, 3 /*NULL*/, 4 /*NULL*/, 5, 6, 7, 8 /*NULL*/, 9 /*NULL*/, 10}, + nulls_at({2, 7})}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + { + auto const expected = [] { + auto child1 = STRINGS_CW{ + "año", "año", "año", "" /*NULL*/, "año", "año", "aab", "aab", "" /*NULL*/, "aab"}; + auto child2 = INTS_CW{1, 1, 1, 0 /*NULL*/, 1, 1, 7, 7, 0 /*NULL*/, 7}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{ + "año", "bit", "bit", "" /*NULL*/, "zit", "zit", "zit", "zit", "" /*NULL*/, "zit"}; + auto child2 = INTS_CW{1, 2, 2, 0 /*NULL*/, 5, 5, 5, 5, 0 /*NULL*/, 5}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", + "año", + "año", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}; + auto child2 = INTS_CW{1, + 1, + 1, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", + "bit", + "bit", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}; + auto child2 = INTS_CW{1, + 2, + 2, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} From dca8a0a0356e90e2b9dfa2a2cedf38d0c90935cb Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 30 Nov 2021 10:40:18 -0600 Subject: [PATCH 026/202] Fix dtype-argument bug in dask_cudf read_csv (#9796) Closes #9719 `dask_cudf.read_csv` currently fails when both `usecols` and `dtype` are specified. This PR is a simple fix. 
In the near future, the `_internal_read_csv` implementation should also be modified to produce a `Blockwise` HLG Layer, but I will leave that for a separate PR. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9796 --- python/dask_cudf/dask_cudf/io/csv.py | 19 +++++++++++-------- .../dask_cudf/dask_cudf/io/tests/test_csv.py | 5 +++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index 132201a349e..ebb02e3b6d4 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -110,9 +110,17 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs): if chunksize is None: return read_csv_without_chunksize(path, **kwargs) + # Let dask.dataframe generate meta dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") - usecols = kwargs.pop("usecols", None) - meta = dask_reader(filenames[0], **kwargs)._meta + kwargs1 = kwargs.copy() + usecols = kwargs1.pop("usecols", None) + dtype = kwargs1.pop("dtype", None) + meta = dask_reader(filenames[0], **kwargs1)._meta + names = meta.columns + if usecols or dtype: + # Regenerate meta with original kwargs if + # `usecols` or `dtype` was specified + meta = dask_reader(filenames[0], **kwargs)._meta dsk = {} i = 0 @@ -127,18 +135,13 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs): chunksize, ) # specify which chunk of the file we care about if start != 0: - kwargs2[ - "names" - ] = meta.columns # no header in the middle of the file + kwargs2["names"] = names # no header in the middle of the file kwargs2["header"] = None - kwargs2["usecols"] = usecols dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) i += 1 divisions = [None] * (len(dsk) + 1) - if usecols is not None: - meta = meta[usecols] return dd.core.new_dd_object(dsk, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index 98061f6c624..32960a90bd7 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -136,7 +136,8 @@ def test_read_csv_chunksize_none(tmp_path, compression, size): dd.assert_eq(df, df2) -def test_csv_reader_usecols(tmp_path): +@pytest.mark.parametrize("dtype", [{"b": str, "c": int}, None]) +def test_csv_reader_usecols(tmp_path, dtype): df = cudf.DataFrame( { "a": [1, 2, 3, 4] * 100, @@ -147,6 +148,6 @@ def test_csv_reader_usecols(tmp_path): csv_path = str(tmp_path / "usecols_data.csv") df.to_csv(csv_path, index=False) ddf = dask_cudf.from_cudf(df[["b", "c"]], npartitions=5) - ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"]) + ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"], dtype=dtype) dd.assert_eq(ddf, ddf2, check_divisions=False, check_index=False) From 1db05c9d889d04df113986eeee0356778ce8b003 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 11:45:54 -0600 Subject: [PATCH 027/202] Use Java classloader to find test resources (#9760) Updates the Java tests to use the classloader to locate test files rather than reaching directly into the source directory. 
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Gera Shegalov (https://github.com/gerashegalov) URL: https://github.com/rapidsai/cudf/pull/9760 --- .../src/test/java/ai/rapids/cudf/TableTest.java | 14 +++++++------- .../src/test/java/ai/rapids/cudf/TestUtils.java | 17 ++++++++++++++++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 4512a08430c..b4247e9bb7c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -70,11 +70,11 @@ import static org.junit.jupiter.api.Assertions.assertTrue; public class TableTest extends CudfTestBase { - private static final File TEST_PARQUET_FILE = new File("src/test/resources/acq.parquet"); - private static final File TEST_ORC_FILE = new File("src/test/resources/TestOrcFile.orc"); - private static final File TEST_ORC_TIMESTAMP_DATE_FILE = new File( - "src/test/resources/timestamp-date-test.orc"); - private static final File TEST_DECIMAL_PARQUET_FILE = new File("src/test/resources/decimal.parquet"); + private static final File TEST_PARQUET_FILE = TestUtils.getResourceAsFile("acq.parquet"); + private static final File TEST_ORC_FILE = TestUtils.getResourceAsFile("TestOrcFile.orc"); + private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc"); + private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet"); + private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -548,7 +548,7 @@ void testReadCSVPrune() { .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.2, 119.8) .build(); - Table table = Table.readCSV(schema, opts, new File("./src/test/resources/simple.csv"))) { + Table table = Table.readCSV(schema, opts, TEST_SIMPLE_CSV_FILE)) { assertTablesAreEqual(expected, table); } } @@ -675,7 +675,7 @@ void testReadCSV() { .column(120L, 121L, 122L, 123L, 124L, 125L, 126L, 127L, 128L, 129L) .column("one", "two", "three", "four", "five", "six", "seven\ud801\uddb8", "eight\uBF68", "nine\u03E8", "ten") .build(); - Table table = Table.readCSV(schema, new File("./src/test/resources/simple.csv"))) { + Table table = Table.readCSV(schema, TEST_SIMPLE_CSV_FILE)) { assertTablesAreEqual(expected, table); } } diff --git a/java/src/test/java/ai/rapids/cudf/TestUtils.java b/java/src/test/java/ai/rapids/cudf/TestUtils.java index 5a799c666c2..a1acab5883b 100644 --- a/java/src/test/java/ai/rapids/cudf/TestUtils.java +++ b/java/src/test/java/ai/rapids/cudf/TestUtils.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,9 @@ package ai.rapids.cudf; +import java.io.File; +import java.net.URISyntaxException; +import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.Random; @@ -211,4 +214,16 @@ static Double[] getDoubles(final long seed, final int size, int specialValues) { }); return result; } + + public static File getResourceAsFile(String resourceName) { + URL url = TestUtils.class.getClassLoader().getResource(resourceName); + if (url == null) { + throw new IllegalArgumentException("Unable to locate resource: " + resourceName); + } + try { + return new File(url.toURI()); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + } } From 1697f63b9e6e80695cb157f479fada72d053fa1a Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 30 Nov 2021 23:39:13 +0530 Subject: [PATCH 028/202] Run compute-sanitizer in nightly build (#9641) Addresses part of https://github.com/rapidsai/cudf/issues/904 - This PR enables running `compute-sanitizer --tool memcheck` on the libcudf unit tests when the env var `COMPUTE_SANITIZER_ENABLE=true` is set. This env var will be enabled only in nightly builds of cudf. (To be enabled in PR https://github.com/rapidsai/gpuci-scripts/pull/675) - This PR also adds a script that parses the compute-sanitizer log into a JUnit XML file that Jenkins can process. Only failures are reported; if there are no errors, no tests appear under the memcheck results. Note: Only `memcheck` is enabled for now. When required, other compute-sanitizer checks could be enabled later. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - MithunR (https://github.com/mythrocks) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/9641 --- ci/gpu/build.sh | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 664e774c68a..8f83c169330 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION.
############################################## # cuDF GPU build and test script for CI # ############################################## @@ -176,6 +176,28 @@ else ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" done + ################################################################################ + # MEMCHECK - Run compute-sanitizer on GoogleTest (only in nightly builds) + ################################################################################ + if [[ "$BUILD_MODE" == "branch" && "$BUILD_TYPE" == "gpu" ]]; then + if [[ "$COMPUTE_SANITIZER_ENABLE" == "true" ]]; then + gpuci_logger "Memcheck on GoogleTests with rmm_mode=cuda" + export GTEST_CUDF_RMM_MODE=cuda + COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck" + mkdir -p "$WORKSPACE/test-results/" + for gt in gtests/*; do + test_name=$(basename ${gt}) + if [[ "$test_name" == "ERROR_TEST" ]]; then + continue + fi + echo "Running GoogleTest $test_name" + ${COMPUTE_SANITIZER_CMD} ${gt} | tee "$WORKSPACE/test-results/${test_name}.cs.log" + done + unset GTEST_CUDF_RMM_MODE + # test-results/*.cs.log are processed in gpuci + fi + fi + CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"` CUDF_CONDA_FILE=`basename "$CUDF_CONDA_FILE" .tar.bz2` #get filename without extension CUDF_CONDA_FILE=${CUDF_CONDA_FILE//-/=} #convert to conda install From 69d576543b5414372f36d02a189a7217d3bb8006 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 30 Nov 2021 14:40:34 -0500 Subject: [PATCH 029/202] Update check for inf/nan strings in libcudf float conversion to ignore case (#9694) Reference https://github.com/rapidsai/cudf/pull/9613/files#r743579126 Add case-insensitive support for the strings `INF`, `INFINITY`, and `NAN` to `cudf::strings::is_float` and `cudf::strings::to_float`, for consistency with https://en.cppreference.com/w/cpp/string/basic_string/stof Also, remove the expensive `replace` call done in cuDF Python before calling this conversion. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Robert Maynard (https://github.com/robertmaynard) - Nghia Truong (https://github.com/ttnghia) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9694 --- cpp/include/cudf/strings/string.cuh | 64 +++++++++++++--- cpp/src/strings/convert/convert_floats.cu | 13 ++-- cpp/tests/strings/floats_tests.cpp | 51 ++++--------- .../java/ai/rapids/cudf/ColumnVectorTest.java | 23 +++--- python/cudf/cudf/core/column/string.py | 73 ------------------- 5 files changed, 85 insertions(+), 139 deletions(-) diff --git a/cpp/include/cudf/strings/string.cuh b/cpp/include/cudf/strings/string.cuh index 82da5ad8f10..d85d19d7f10 100644 --- a/cpp/include/cudf/strings/string.cuh +++ b/cpp/include/cudf/strings/string.cuh @@ -52,6 +52,43 @@ inline __device__ bool is_integer(string_view const& d_str) thrust::seq, begin, end, [] __device__(auto chr) { return chr >= '0' && chr <= '9'; }); } +/** + * @brief Returns true if input contains the not-a-number string. + * + * The following are valid for this function: "NAN" and "NaN" + * @param d_str input string + * @return true if input is as valid NaN string. + */ +inline __device__ bool is_nan_str(string_view const& d_str) +{ + auto const ptr = d_str.data(); + return (d_str.size_bytes() == 3) && (ptr[0] == 'N' || ptr[0] == 'n') && + (ptr[1] == 'A' || ptr[1] == 'a') && (ptr[2] == 'N' || ptr[2] == 'n'); +} + +/** + * @brief Returns true if input contains the infinity string.
+ * + * The following are valid for this function: "INF", "INFINITY", and "Inf" + * @param d_str input string + * @return true if input is as valid Inf string. + */ +inline __device__ bool is_inf_str(string_view const& d_str) +{ + auto const ptr = d_str.data(); + auto const size = d_str.size_bytes(); + + if (size != 3 && size != 8) return false; + + auto const prefix_valid = (ptr[0] == 'I' || ptr[0] == 'i') && (ptr[1] == 'N' || ptr[1] == 'n') && + (ptr[2] == 'F' || ptr[2] == 'f'); + + return prefix_valid && + ((size == 3) || ((ptr[3] == 'I' || ptr[3] == 'i') && (ptr[4] == 'N' || ptr[4] == 'n') && + (ptr[5] == 'I' || ptr[5] == 'i') && (ptr[6] == 'T' || ptr[6] == 't') && + (ptr[7] == 'Y' || ptr[7] == 'y'))); +} + /** * @brief Returns `true` if all characters in the string * are valid for conversion to a float type. @@ -65,8 +102,8 @@ inline __device__ bool is_integer(string_view const& d_str) * An empty string returns `false`. * No bounds checking is performed to verify if the value would fit * within a specific float type. - * The following strings are also allowed "NaN", "Inf" and, "-Inf" - * and will return true. + * The following strings are also allowed and will return true: + * "NaN", "NAN", "Inf", "INF", "INFINITY" * * @param d_str String to check. * @return true if string has valid float characters @@ -74,29 +111,32 @@ inline __device__ bool is_integer(string_view const& d_str) inline __device__ bool is_float(string_view const& d_str) { if (d_str.empty()) return false; - // strings allowed by the converter - if (d_str.compare("NaN", 3) == 0) return true; - if (d_str.compare("Inf", 3) == 0) return true; - if (d_str.compare("-Inf", 4) == 0) return true; bool decimal_found = false; bool exponent_found = false; size_type bytes = d_str.size_bytes(); const char* data = d_str.data(); // sign character allowed at the beginning of the string - size_type chidx = (*data == '-' || *data == '+') ? 1 : 0; - bool result = chidx < bytes; + size_type ch_idx = (*data == '-' || *data == '+') ? 1 : 0; + + bool result = ch_idx < bytes; + // check for nan and infinity strings + if (result && data[ch_idx] > '9') { + auto const inf_nan = string_view(data + ch_idx, bytes - ch_idx); + if (is_nan_str(inf_nan) || is_inf_str(inf_nan)) return true; + } + // check for float chars [0-9] and a single decimal '.' // and scientific notation [eE][+-][0-9] - for (; chidx < bytes; ++chidx) { - auto chr = data[chidx]; + for (; ch_idx < bytes; ++ch_idx) { + auto chr = data[ch_idx]; if (chr >= '0' && chr <= '9') continue; if (!decimal_found && chr == '.') { decimal_found = true; // no more decimals continue; } if (!exponent_found && (chr == 'e' || chr == 'E')) { - if (chidx + 1 < bytes) chr = data[chidx + 1]; - if (chr == '-' || chr == '+') ++chidx; + if (ch_idx + 1 < bytes) chr = data[ch_idx + 1]; + if (chr == '-' || chr == '+') ++ch_idx; decimal_found = true; // no decimal allowed in exponent exponent_found = true; // no more exponents continue; diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 366d4fe7d42..70b5f528213 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -45,7 +45,7 @@ namespace { * @brief This function converts the given string into a * floating point double value. * - * This will also map strings containing "NaN", "Inf" and "-Inf" + * This will also map strings containing "NaN", "Inf", etc. * to the appropriate float values. * * This function will also handle scientific notation format. 
@@ -55,16 +55,19 @@ __device__ inline double stod(string_view const& d_str) const char* in_ptr = d_str.data(); const char* end = in_ptr + d_str.size_bytes(); if (end == in_ptr) return 0.0; - // special strings - if (d_str.compare("NaN", 3) == 0) return std::numeric_limits::quiet_NaN(); - if (d_str.compare("Inf", 3) == 0) return std::numeric_limits::infinity(); - if (d_str.compare("-Inf", 4) == 0) return -std::numeric_limits::infinity(); double sign{1.0}; if (*in_ptr == '-' || *in_ptr == '+') { sign = (*in_ptr == '-' ? -1 : 1); ++in_ptr; } + // special strings: NaN, Inf + if ((in_ptr < end) && *in_ptr > '9') { + auto const inf_nan = string_view(in_ptr, static_cast(thrust::distance(in_ptr, end))); + if (string::is_nan_str(inf_nan)) return std::numeric_limits::quiet_NaN(); + if (string::is_inf_str(inf_nan)) return sign * std::numeric_limits::infinity(); + } + // Parse and store the mantissa as much as we can, // until we are about to exceed the limit of uint64_t constexpr uint64_t max_holding = (std::numeric_limits::max() - 9L) / 10L; diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index 126bffa1e49..e6f4f6bb8d9 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -58,32 +58,20 @@ TEST_F(StringsConvertTest, IsFloat) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); cudf::test::strings_column_wrapper strings2( - {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); + {"-34", "9.8", "1234567890", "-917.2e5", "INF", "NAN", "-Inf", "INFINITY"}); results = cudf::strings::is_float(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } TEST_F(StringsConvertTest, ToFloats32) { - std::vector h_strings{"1234", - nullptr, - "-876", - "543.2", - "-0.12", - ".25", - "-.002", - "", - "-0.0", - "1.2e4", - "NaN", - "abc123", - "123abc", - "456e", - "-1.78e+5", - "-122.33644782123456789", - "12e+309", - "3.4028236E38"}; + std::vector h_strings{ + "1234", nullptr, "-876", "543.2", + "-0.12", ".25", "-.002", "", + "-0.0", "1.2e4", "NAN", "abc123", + "123abc", "456e", "-1.78e+5", "-122.33644782123456789", + "12e+309", "3.4028236E38", "INF", "Infinity"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), @@ -135,24 +123,11 @@ TEST_F(StringsConvertTest, FromFloats32) TEST_F(StringsConvertTest, ToFloats64) { - std::vector h_strings{"1234", - nullptr, - "-876", - "543.2", - "-0.12", - ".25", - "-.002", - "", - "-0.0", - "1.28e256", - "NaN", - "abc123", - "123abc", - "456e", - "-1.78e+5", - "-122.33644782", - "12e+309", - "1.7976931348623159E308"}; + std::vector h_strings{ + "1234", nullptr, "-876", "543.2", "-0.12", ".25", + "-.002", "", "-0.0", "1.28e256", "NaN", "abc123", + "123abc", "456e", "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308", + "-Inf", "-INFINITY"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index a582541a0d4..cf602c26717 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4919,11 +4919,12 @@ void testIsFloat() { try (ColumnVector floatStringCV = ColumnVector.fromStrings(floatStrings); ColumnVector isFloat = floatStringCV.isFloat(); 
ColumnVector floats = floatStringCV.asFloats(); - ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, 0f, Float.POSITIVE_INFINITY, - Float.NEGATIVE_INFINITY, 0f, 0f, -0f, 0f, Float.MAX_VALUE, Float.POSITIVE_INFINITY, - -Float.MAX_VALUE, Float.NEGATIVE_INFINITY, 1.2e-24f, 0f, 0f, null, 423f); - ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false, - false, true, true, true, true, true, true, true, false, false, null, true)) { + ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, Float.NaN, Float.POSITIVE_INFINITY, + Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY, -0f, 0f, + Float.MAX_VALUE, Float.POSITIVE_INFINITY, -Float.MAX_VALUE, Float.NEGATIVE_INFINITY, + 1.2e-24f, 0f, 0f, null, 423f); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true, + true, true, true, true, true, true, true, true, false, false, null, true)) { assertColumnsAreEqual(expected, isFloat); assertColumnsAreEqual(expectedFloats, floats); } @@ -4944,12 +4945,12 @@ void testIsDouble() { try (ColumnVector doubleStringCV = ColumnVector.fromStrings(doubleStrings); ColumnVector isDouble = doubleStringCV.isFloat(); ColumnVector doubles = doubleStringCV.asDoubles(); - ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, 0d, - Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, 0d, 0d, -0d, 0d, Double.MAX_VALUE, - Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY, 1.2e-234d, 0d, - 0d, null, 423d); - ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false, - false, true, true, true, true, true, true, true, false, false, null, true)) { + ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, Double.NaN, + Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, + -0d, 0d, Double.MAX_VALUE, Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY, + 1.2e-234d, 0d, 0d, null, 423d); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true, + true, true, true, true, true, true, true, true, false, false, null, true)) { assertColumnsAreEqual(expected, isDouble); assertColumnsAreEqual(expectedDoubles, doubles); } diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a167383c65c..2a91abc5701 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -97,69 +97,6 @@ def str_to_boolean(column: StringColumn): cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta, } -_NAN_INF_VARIATIONS = [ - "nan", - "NAN", - "Nan", - "naN", - "nAN", - "NAn", - "nAn", - "-inf", - "-INF", - "-InF", - "-inF", - "-iNF", - "-INf", - "-iNf", - "+inf", - "+INF", - "+InF", - "+inF", - "+iNF", - "+INf", - "+Inf", - "+iNf", - "inf", - "INF", - "InF", - "inF", - "iNF", - "INf", - "iNf", -] -_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS = [ - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", -] - def _is_supported_regex_flags(flags): return flags == 0 or ( @@ -5309,16 +5246,6 @@ def as_numerical_column( "type due to presence of non-integer values." 
) elif out_dtype.kind == "f": - # TODO: Replace this `replace` call with a - # case-insensitive method once following - # issue is fixed: https://github.com/rapidsai/cudf/issues/5217 - old_values = cudf.core.column.as_column(_NAN_INF_VARIATIONS) - new_values = cudf.core.column.as_column( - _LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS - ) - string_col = libcudf.replace.replace( - string_col, old_values, new_values - ) if not libstrings.is_float(string_col).all(): raise ValueError( "Could not convert strings to float " From 00a8845780ae9289f483f1113e5c62d4acd7dfe7 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 14:02:24 -0600 Subject: [PATCH 030/202] Refactor TableTest assertion methods to a separate utility class (#9762) TableTest has a number of dependencies, e.g.: Parquet, Hadoop, etc., that make it less ideal to be used in an external project. This moves the column and table assertion methods to a separate AssertUtils utility class that avoids the extra dependencies. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Gera Shegalov (https://github.com/gerashegalov) URL: https://github.com/rapidsai/cudf/pull/9762 --- .../ai/rapids/cudf/ArrowColumnVectorTest.java | 3 +- .../test/java/ai/rapids/cudf/AssertUtils.java | 272 ++++++++++++++++++ .../java/ai/rapids/cudf/BinaryOpTest.java | 2 +- .../ai/rapids/cudf/ByteColumnVectorTest.java | 6 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 38 +-- .../test/java/ai/rapids/cudf/IfElseTest.java | 2 +- .../ai/rapids/cudf/IntColumnVectorTest.java | 4 +- .../test/java/ai/rapids/cudf/ScalarTest.java | 2 +- .../test/java/ai/rapids/cudf/TableTest.java | 251 +--------------- .../cudf/TimestampColumnVectorTest.java | 2 +- .../test/java/ai/rapids/cudf/UnaryOpTest.java | 2 +- .../cudf/ast/CompiledExpressionTest.java | 2 +- 12 files changed, 309 insertions(+), 277 deletions(-) create mode 100644 java/src/test/java/ai/rapids/cudf/AssertUtils.java diff --git a/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java index d5d4059d18d..2a11b24b3a8 100644 --- a/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java @@ -21,7 +21,6 @@ import java.nio.ByteBuffer; import java.util.ArrayList; -import ai.rapids.cudf.HostColumnVector.BasicType; import ai.rapids.cudf.HostColumnVector.ListType; import ai.rapids.cudf.HostColumnVector.StructType; @@ -40,7 +39,7 @@ import org.junit.jupiter.api.Test; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; diff --git a/java/src/test/java/ai/rapids/cudf/AssertUtils.java b/java/src/test/java/ai/rapids/cudf/AssertUtils.java new file mode 100644 index 00000000000..184e7dd0c57 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/AssertUtils.java @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** Utility methods for asserting in unit tests */ +public class AssertUtils { + + /** + * Checks and asserts that passed in columns match + * @param expect The expected result column + * @param cv The input column + */ + public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) { + assertColumnsAreEqual(expect, cv, "unnamed"); + } + + /** + * Checks and asserts that passed in columns match + * @param expected The expected result column + * @param cv The input column + * @param colName The name of the column + */ + public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) { + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); + } + + /** + * Checks and asserts that passed in host columns match + * @param expected The expected result host column + * @param cv The input host column + * @param colName The name of the host column + */ + public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) { + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); + } + + /** + * Checks and asserts that passed in Struct columns match + * @param expected The expected result Struct column + * @param cv The input Struct column + */ + public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) { + assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false); + } + + /** + * Checks and asserts that passed in Struct columns match + * @param expected The expected result Struct column + * @param rowOffset The row number to look from + * @param length The number of rows to consider + * @param cv The input Struct column + * @param colName The name of the column + * @param enableNullCountCheck Whether to check for nulls in the Struct column + * @param enableNullabilityCheck Whether the table have a validity mask + */ + public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length, + ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { + try (HostColumnVector hostExpected = expected.copyToHost(); + HostColumnVector hostcv = cv.copyToHost()) { + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck); + } + } + + /** + * Checks and asserts that passed in columns match + * @param expected The expected result column + * @param cv The input column + * @param colName The name of the column + * @param enableNullCheck Whether to check for nulls in the column + * @param enableNullabilityCheck Whether the table have a validity mask + */ + public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length, + ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) { + try (HostColumnVector hostExpected = expected.copyToHost(); + HostColumnVector hostcv = cv.copyToHost()) { + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck); + } + } + + /** + * Checks and asserts that passed in host columns match + * @param expected The 
expected result host column + * @param rowOffset start row index + * @param length number of rows from starting offset + * @param cv The input host column + * @param colName The name of the host column + * @param enableNullCountCheck Whether to check for nulls in the host column + */ + public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length, + HostColumnVectorCore cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { + assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName); + assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName); + assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName); + if (enableNullCountCheck) { + assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For Column " + colName); + } else { + // TODO add in a proper check when null counts are supported by serializing a partitioned column + } + if (enableNullabilityCheck) { + assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected"); + } + DType type = expected.getType(); + for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) { + long tableRow = expectedRow - rowOffset; + assertEquals(expected.isNull(expectedRow), cv.isNull(tableRow), + "NULL for Column " + colName + " Row " + tableRow); + if (!expected.isNull(expectedRow)) { + switch (type.typeId) { + case BOOL8: // fall through + case INT8: // fall through + case UINT8: + assertEquals(expected.getByte(expectedRow), cv.getByte(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case INT16: // fall through + case UINT16: + assertEquals(expected.getShort(expectedRow), cv.getShort(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case INT32: // fall through + case UINT32: // fall through + case TIMESTAMP_DAYS: + case DURATION_DAYS: + case DECIMAL32: + assertEquals(expected.getInt(expectedRow), cv.getInt(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case INT64: // fall through + case UINT64: // fall through + case DURATION_MICROSECONDS: // fall through + case DURATION_MILLISECONDS: // fall through + case DURATION_NANOSECONDS: // fall through + case DURATION_SECONDS: // fall through + case TIMESTAMP_MICROSECONDS: // fall through + case TIMESTAMP_MILLISECONDS: // fall through + case TIMESTAMP_NANOSECONDS: // fall through + case TIMESTAMP_SECONDS: + case DECIMAL64: + assertEquals(expected.getLong(expectedRow), cv.getLong(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case DECIMAL128: + assertEquals(expected.getBigDecimal(expectedRow), cv.getBigDecimal(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case FLOAT32: + CudfTestBase.assertEqualsWithinPercentage(expected.getFloat(expectedRow), cv.getFloat(tableRow), 0.0001, + "Column " + colName + " Row " + tableRow); + break; + case FLOAT64: + CudfTestBase.assertEqualsWithinPercentage(expected.getDouble(expectedRow), cv.getDouble(tableRow), 0.0001, + "Column " + colName + " Row " + tableRow); + break; + case STRING: + assertArrayEquals(expected.getUTF8(expectedRow), cv.getUTF8(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case LIST: + HostMemoryBuffer expectedOffsets = expected.getOffsets(); + HostMemoryBuffer cvOffsets = cv.getOffsets(); + int expectedChildRows = expectedOffsets.getInt((expectedRow + 1) * 4) - + expectedOffsets.getInt(expectedRow * 4); + int 
cvChildRows = cvOffsets.getInt((tableRow + 1) * 4) - + cvOffsets.getInt(tableRow * 4); + assertEquals(expectedChildRows, cvChildRows, "Child row count for Column " + + colName + " Row " + tableRow); + break; + case STRUCT: + // parent column only has validity which was checked above + break; + default: + throw new IllegalArgumentException(type + " is not supported yet"); + } + } + } + + if (type.isNestedType()) { + switch (type.typeId) { + case LIST: + int expectedChildRowOffset = 0; + int numChildRows = 0; + if (length > 0) { + HostMemoryBuffer expectedOffsets = expected.getOffsets(); + HostMemoryBuffer cvOffsets = cv.getOffsets(); + expectedChildRowOffset = expectedOffsets.getInt(rowOffset * 4); + numChildRows = expectedOffsets.getInt((rowOffset + length) * 4) - + expectedChildRowOffset; + } + assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset, + numChildRows, cv.getNestedChildren().get(0), colName + " list child", + enableNullCountCheck, enableNullabilityCheck); + break; + case STRUCT: + List expectedChildren = expected.getNestedChildren(); + List cvChildren = cv.getNestedChildren(); + for (int i = 0; i < expectedChildren.size(); i++) { + HostColumnVectorCore expectedChild = expectedChildren.get(i); + HostColumnVectorCore cvChild = cvChildren.get(i); + String childName = colName + " child " + i; + assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + colName); + assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild, + colName, enableNullCountCheck, enableNullabilityCheck); + } + break; + default: + throw new IllegalArgumentException(type + " is not supported yet"); + } + } + } + + /** + * Checks and asserts that the two tables from a given rowindex match based on a provided schema. 
+ * @param expected the expected result table + * @param rowOffset the row number to start checking from + * @param length the number of rows to check + * @param table the input table to compare against expected + * @param enableNullCheck whether to check for nulls or not + * @param enableNullabilityCheck whether the table have a validity mask + */ + public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table, + boolean enableNullCheck, boolean enableNullabilityCheck) { + assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns()); + assertEquals(length, table.getRowCount(), "ROW COUNT"); + for (int col = 0; col < expected.getNumberOfColumns(); col++) { + ColumnVector expect = expected.getColumn(col); + ColumnVector cv = table.getColumn(col); + String name = String.valueOf(col); + if (rowOffset != 0 || length != expected.getRowCount()) { + name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1); + } + assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck); + } + } + + /** + * Checks and asserts that the two tables match + * @param expected the expected result table + * @param table the input table to compare against expected + */ + public static void assertTablesAreEqual(Table expected, Table table) { + assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false); + } + + public static void assertTableTypes(DType[] expectedTypes, Table t) { + int len = t.getNumberOfColumns(); + assertEquals(expectedTypes.length, len); + for (int i = 0; i < len; i++) { + ColumnVector vec = t.getColumn(i); + DType type = vec.getType(); + assertEquals(expectedTypes[i], type, "Types don't match at " + i); + } + } +} diff --git a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java index 894861b8c44..0ca997d3c80 100644 --- a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java +++ b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java @@ -27,7 +27,7 @@ import java.util.Arrays; import java.util.stream.IntStream; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static ai.rapids.cudf.TestUtils.*; import static org.junit.jupiter.api.Assertions.assertThrows; diff --git a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java index 878fa7e4516..a26dbec4907 100644 --- a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java @@ -127,9 +127,9 @@ public void testCastToByte() { ColumnVector expected1 = ColumnVector.fromBytes((byte)4, (byte)3, (byte)8); ColumnVector expected2 = ColumnVector.fromBytes((byte)100); ColumnVector expected3 = ColumnVector.fromBytes((byte)-23)) { - TableTest.assertColumnsAreEqual(expected1, byteColumnVector1); - TableTest.assertColumnsAreEqual(expected2, byteColumnVector2); - TableTest.assertColumnsAreEqual(expected3, byteColumnVector3); + AssertUtils.assertColumnsAreEqual(expected1, byteColumnVector1); + AssertUtils.assertColumnsAreEqual(expected2, byteColumnVector2); + AssertUtils.assertColumnsAreEqual(expected3, byteColumnVector3); } } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index cf602c26717..fa9052029cc 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ 
b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -34,8 +34,10 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertStructColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertTablesAreEqual; import static ai.rapids.cudf.QuantileMethod.*; -import static ai.rapids.cudf.TableTest.*; import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assumptions.assumeTrue; @@ -86,8 +88,8 @@ void testTransformVector() { ColumnVector cv1 = cv.transform(ptx, true); ColumnVector cv2 = cv.transform(cuda, false); ColumnVector expected = ColumnVector.fromBoxedInts(2*2-2, 3*3-3, null, 4*4-4)) { - TableTest.assertColumnsAreEqual(expected, cv1); - TableTest.assertColumnsAreEqual(expected, cv2); + assertColumnsAreEqual(expected, cv1); + assertColumnsAreEqual(expected, cv2); } } @@ -252,7 +254,7 @@ void testStringCreation() { try (ColumnVector cv = ColumnVector.fromStrings("d", "sd", "sde", null, "END"); HostColumnVector host = cv.copyToHost(); ColumnVector backAgain = host.copyToDevice()) { - TableTest.assertColumnsAreEqual(cv, backAgain); + assertColumnsAreEqual(cv, backAgain); } } @@ -265,7 +267,7 @@ void testUTF8StringCreation() { null, "END".getBytes(StandardCharsets.UTF_8)); ColumnVector expected = ColumnVector.fromStrings("d", "sd", "sde", null, "END")) { - TableTest.assertColumnsAreEqual(expected, cv); + assertColumnsAreEqual(expected, cv); } } @@ -299,7 +301,7 @@ void testConcatNoNulls() { ColumnVector v2 = ColumnVector.fromInts(8, 9); ColumnVector v = ColumnVector.concatenate(v0, v1, v2); ColumnVector expected = ColumnVector.fromInts(1, 2, 3, 4, 5, 6, 7, 8, 9)) { - TableTest.assertColumnsAreEqual(expected, v); + assertColumnsAreEqual(expected, v); } } @@ -310,7 +312,7 @@ void testConcatWithNulls() { ColumnVector v2 = ColumnVector.fromBoxedDoubles(null, 9.0); ColumnVector v = ColumnVector.concatenate(v0, v1, v2); ColumnVector expected = ColumnVector.fromBoxedDoubles(1., 2., 3., 4., 5., 6., 7., null, 9.)) { - TableTest.assertColumnsAreEqual(expected, v); + assertColumnsAreEqual(expected, v); } } @@ -1882,13 +1884,13 @@ void testSubvector() { try (ColumnVector vec = ColumnVector.fromBoxedInts(1, 2, 3, null, 5); ColumnVector expected = ColumnVector.fromBoxedInts(2, 3, null, 5); ColumnVector found = vec.subVector(1, 5)) { - TableTest.assertColumnsAreEqual(expected, found); + assertColumnsAreEqual(expected, found); } try (ColumnVector vec = ColumnVector.fromStrings("1", "2", "3", null, "5"); ColumnVector expected = ColumnVector.fromStrings("2", "3", null, "5"); ColumnVector found = vec.subVector(1, 5)) { - TableTest.assertColumnsAreEqual(expected, found); + assertColumnsAreEqual(expected, found); } } @@ -2014,7 +2016,7 @@ void testTrimStringsWhiteSpace() { try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n"); ColumnVector trimmed = cv.strip(); ColumnVector expected = ColumnVector.fromStrings("123", "123", null, "123", "123")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2024,7 +2026,7 @@ void testTrimStrings() { Scalar one = Scalar.fromString(" 1"); ColumnVector trimmed = cv.strip(one); ColumnVector expected = ColumnVector.fromStrings("23", "23", null, "23", "\t\t123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2033,7 +2035,7 @@ void 
testLeftTrimStringsWhiteSpace() { try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n"); ColumnVector trimmed = cv.lstrip(); ColumnVector expected = ColumnVector.fromStrings("123", "123 ", null, "123 ", "123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2043,7 +2045,7 @@ void testLeftTrimStrings() { Scalar one = Scalar.fromString(" 1"); ColumnVector trimmed = cv.lstrip(one); ColumnVector expected = ColumnVector.fromStrings("23", "23 ", null, "231", "\t\t123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2052,7 +2054,7 @@ void testRightTrimStringsWhiteSpace() { try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n"); ColumnVector trimmed = cv.rstrip(); ColumnVector expected = ColumnVector.fromStrings(" 123", "123", null, " 123", "\t\t123")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2062,7 +2064,7 @@ void testRightTrimStrings() { Scalar one = Scalar.fromString(" 1"); ColumnVector trimmed = cv.rstrip(one); ColumnVector expected = ColumnVector.fromStrings("123", "123", null, "123", "\t\t123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2108,7 +2110,7 @@ void testCountElements() { Arrays.asList(1, 2, 3), Arrays.asList(1, 2, 3, 4)); ColumnVector lengths = cv.countElements(); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 2, 3, 4)) { - TableTest.assertColumnsAreEqual(expected, lengths); + assertColumnsAreEqual(expected, lengths); } } @@ -2117,7 +2119,7 @@ void testStringLengths() { try (ColumnVector cv = ColumnVector.fromStrings("1", "12", null, "123", "1234"); ColumnVector lengths = cv.getCharLengths(); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 3, 4)) { - TableTest.assertColumnsAreEqual(expected, lengths); + assertColumnsAreEqual(expected, lengths); } } @@ -2126,7 +2128,7 @@ void testGetByteCount() { try (ColumnVector cv = ColumnVector.fromStrings("1", "12", "123", null, "1234"); ColumnVector byteLengthVector = cv.getByteCount(); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, 3, null, 4)) { - TableTest.assertColumnsAreEqual(expected, byteLengthVector); + assertColumnsAreEqual(expected, byteLengthVector); } } diff --git a/java/src/test/java/ai/rapids/cudf/IfElseTest.java b/java/src/test/java/ai/rapids/cudf/IfElseTest.java index 86ddcc23416..a078befdf40 100644 --- a/java/src/test/java/ai/rapids/cudf/IfElseTest.java +++ b/java/src/test/java/ai/rapids/cudf/IfElseTest.java @@ -25,7 +25,7 @@ import java.util.stream.Stream; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.assertThrows; public class IfElseTest extends CudfTestBase { diff --git a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java index dd03c4de69e..2fb8164534b 100644 --- a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java @@ -117,8 +117,8 @@ public void testCastToInt() { ColumnVector expected1 = ColumnVector.fromInts(4, 3, 8); ColumnVector intColumnVector2 = shortColumnVector.asInts(); ColumnVector expected2 = ColumnVector.fromInts(100)) { - TableTest.assertColumnsAreEqual(expected1, 
intColumnVector1); - TableTest.assertColumnsAreEqual(expected2, intColumnVector2); + AssertUtils.assertColumnsAreEqual(expected1, intColumnVector1); + AssertUtils.assertColumnsAreEqual(expected2, intColumnVector2); } } diff --git a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java index 0889363c2d0..86c340bb321 100644 --- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java +++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java @@ -29,7 +29,7 @@ import java.nio.charset.StandardCharsets; import java.util.Arrays; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.*; public class ScalarTest extends CudfTestBase { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index b4247e9bb7c..fa221e19387 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -57,6 +57,11 @@ import java.util.stream.Collectors; import static ai.rapids.cudf.ColumnWriterOptions.mapColumn; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertPartialColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertPartialTablesAreEqual; +import static ai.rapids.cudf.AssertUtils.assertTableTypes; +import static ai.rapids.cudf.AssertUtils.assertTablesAreEqual; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; import static ai.rapids.cudf.ParquetWriterOptions.structBuilder; import static ai.rapids.cudf.Table.TestBuilder; @@ -94,242 +99,6 @@ public class TableTest extends CudfTestBase { "8|118.2|128\n" + "9|119.8|129").getBytes(StandardCharsets.UTF_8); - /** - * Checks and asserts that passed in columns match - * @param expect The expected result column - * @param cv The input column - */ - public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) { - assertColumnsAreEqual(expect, cv, "unnamed"); - } - - /** - * Checks and asserts that passed in columns match - * @param expected The expected result column - * @param cv The input column - * @param colName The name of the column - */ - public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); - } - - /** - * Checks and asserts that passed in host columns match - * @param expected The expected result host column - * @param cv The input host column - * @param colName The name of the host column - */ - public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); - } - - /** - * Checks and asserts that passed in Struct columns match - * @param expected The expected result Struct column - * @param cv The input Struct column - */ - public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) { - assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false); - } - - /** - * Checks and asserts that passed in Struct columns match - * @param expected The expected result Struct column - * @param rowOffset The row number to look from - * @param length The number of rows to consider - * @param cv The input Struct column - * @param colName The name of the column - * @param 
enableNullCountCheck Whether to check for nulls in the Struct column - * @param enableNullabilityCheck Whether the table have a validity mask - */ - public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { - try (HostColumnVector hostExpected = expected.copyToHost(); - HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck); - } - } - - /** - * Checks and asserts that passed in columns match - * @param expected The expected result column - * @param cv The input column - * @param colName The name of the column - * @param enableNullCheck Whether to check for nulls in the column - * @param enableNullabilityCheck Whether the table have a validity mask - */ - public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) { - try (HostColumnVector hostExpected = expected.copyToHost(); - HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck); - } - } - - /** - * Checks and asserts that passed in host columns match - * @param expected The expected result host column - * @param rowOffset start row index - * @param length number of rows from starting offset - * @param cv The input host column - * @param colName The name of the host column - * @param enableNullCountCheck Whether to check for nulls in the host column - */ - public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length, - HostColumnVectorCore cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { - assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName); - assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName); - assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName); - if (enableNullCountCheck) { - assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For Column " + colName); - } else { - // TODO add in a proper check when null counts are supported by serializing a partitioned column - } - if (enableNullabilityCheck) { - assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected"); - } - DType type = expected.getType(); - for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) { - long tableRow = expectedRow - rowOffset; - assertEquals(expected.isNull(expectedRow), cv.isNull(tableRow), - "NULL for Column " + colName + " Row " + tableRow); - if (!expected.isNull(expectedRow)) { - switch (type.typeId) { - case BOOL8: // fall through - case INT8: // fall through - case UINT8: - assertEquals(expected.getByte(expectedRow), cv.getByte(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case INT16: // fall through - case UINT16: - assertEquals(expected.getShort(expectedRow), cv.getShort(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case INT32: // fall through - case UINT32: // fall through - case TIMESTAMP_DAYS: - case DURATION_DAYS: - case DECIMAL32: - assertEquals(expected.getInt(expectedRow), cv.getInt(tableRow), - "Column " + colName + " Row " + tableRow); - 
break; - case INT64: // fall through - case UINT64: // fall through - case DURATION_MICROSECONDS: // fall through - case DURATION_MILLISECONDS: // fall through - case DURATION_NANOSECONDS: // fall through - case DURATION_SECONDS: // fall through - case TIMESTAMP_MICROSECONDS: // fall through - case TIMESTAMP_MILLISECONDS: // fall through - case TIMESTAMP_NANOSECONDS: // fall through - case TIMESTAMP_SECONDS: - case DECIMAL64: - assertEquals(expected.getLong(expectedRow), cv.getLong(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case DECIMAL128: - assertEquals(expected.getBigDecimal(expectedRow), cv.getBigDecimal(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case FLOAT32: - assertEqualsWithinPercentage(expected.getFloat(expectedRow), cv.getFloat(tableRow), 0.0001, - "Column " + colName + " Row " + tableRow); - break; - case FLOAT64: - assertEqualsWithinPercentage(expected.getDouble(expectedRow), cv.getDouble(tableRow), 0.0001, - "Column " + colName + " Row " + tableRow); - break; - case STRING: - assertArrayEquals(expected.getUTF8(expectedRow), cv.getUTF8(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case LIST: - HostMemoryBuffer expectedOffsets = expected.getOffsets(); - HostMemoryBuffer cvOffsets = cv.getOffsets(); - int expectedChildRows = expectedOffsets.getInt((expectedRow + 1) * 4) - - expectedOffsets.getInt(expectedRow * 4); - int cvChildRows = cvOffsets.getInt((tableRow + 1) * 4) - - cvOffsets.getInt(tableRow * 4); - assertEquals(expectedChildRows, cvChildRows, "Child row count for Column " + - colName + " Row " + tableRow); - break; - case STRUCT: - // parent column only has validity which was checked above - break; - default: - throw new IllegalArgumentException(type + " is not supported yet"); - } - } - } - - if (type.isNestedType()) { - switch (type.typeId) { - case LIST: - int expectedChildRowOffset = 0; - int numChildRows = 0; - if (length > 0) { - HostMemoryBuffer expectedOffsets = expected.getOffsets(); - HostMemoryBuffer cvOffsets = cv.getOffsets(); - expectedChildRowOffset = expectedOffsets.getInt(rowOffset * 4); - numChildRows = expectedOffsets.getInt((rowOffset + length) * 4) - - expectedChildRowOffset; - } - assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset, - numChildRows, cv.getNestedChildren().get(0), colName + " list child", - enableNullCountCheck, enableNullabilityCheck); - break; - case STRUCT: - List expectedChildren = expected.getNestedChildren(); - List cvChildren = cv.getNestedChildren(); - for (int i = 0; i < expectedChildren.size(); i++) { - HostColumnVectorCore expectedChild = expectedChildren.get(i); - HostColumnVectorCore cvChild = cvChildren.get(i); - String childName = colName + " child " + i; - assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + colName); - assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild, - colName, enableNullCountCheck, enableNullabilityCheck); - } - break; - default: - throw new IllegalArgumentException(type + " is not supported yet"); - } - } - } - - /** - * Checks and asserts that the two tables from a given rowindex match based on a provided schema. 
- * @param expected the expected result table - * @param rowOffset the row number to start checking from - * @param length the number of rows to check - * @param table the input table to compare against expected - * @param enableNullCheck whether to check for nulls or not - * @param enableNullabilityCheck whether the table have a validity mask - */ - public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table, - boolean enableNullCheck, boolean enableNullabilityCheck) { - assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns()); - assertEquals(length, table.getRowCount(), "ROW COUNT"); - for (int col = 0; col < expected.getNumberOfColumns(); col++) { - ColumnVector expect = expected.getColumn(col); - ColumnVector cv = table.getColumn(col); - String name = String.valueOf(col); - if (rowOffset != 0 || length != expected.getRowCount()) { - name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1); - } - assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck); - } - } - - /** - * Checks and asserts that the two tables match - * @param expected the expected result table - * @param table the input table to compare against expected - */ - public static void assertTablesAreEqual(Table expected, Table table) { - assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false); - } - void assertTablesHaveSameValues(HashMap[] expectedTable, Table table) { assertEquals(expectedTable.length, table.getNumberOfColumns()); int numCols = table.getNumberOfColumns(); @@ -358,16 +127,6 @@ void assertTablesHaveSameValues(HashMap[] expectedTable, Table } } - public static void assertTableTypes(DType[] expectedTypes, Table t) { - int len = t.getNumberOfColumns(); - assertEquals(expectedTypes.length, len); - for (int i = 0; i < len; i++) { - ColumnVector vec = t.getColumn(i); - DType type = vec.getType(); - assertEquals(expectedTypes[i], type, "Types don't match at " + i); - } - } - @Test void testMergeSimple() { try (Table table1 = new Table.TestBuilder() diff --git a/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java index 8bf1370a0f7..9a929cec98d 100644 --- a/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java @@ -22,7 +22,7 @@ import java.util.function.Function; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.assertEquals; public class TimestampColumnVectorTest extends CudfTestBase { diff --git a/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java b/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java index 76970e8bf76..7fcb7cbd85b 100644 --- a/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java +++ b/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java @@ -22,7 +22,7 @@ import ai.rapids.cudf.HostColumnVector.Builder; import org.junit.jupiter.api.Test; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; public class UnaryOpTest extends CudfTestBase { private static final Double[] DOUBLES_1 = new Double[]{1.0, 10.0, -100.1, 5.3, 50.0, 100.0, null, Double.NaN, Double.POSITIVE_INFINITY, 1/9.0, Double.NEGATIVE_INFINITY, 500.0, -500.0}; diff --git a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java 
b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java index 2fb6792b409..e50da0a4d4d 100644 --- a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java @@ -36,7 +36,7 @@ import java.util.function.Function; import java.util.stream.Stream; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; public class CompiledExpressionTest extends CudfTestBase { @Test From 554ac817498e64ba1c7ef054873fab7dc658d25c Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 15:50:56 -0600 Subject: [PATCH 031/202] Load native dependencies when Java ColumnView is loaded (#9800) The Java ColumnView class has native methods but does not ensure the corresponding native libraries that implement those methods are loaded. This adds a static code block to the ColumnView class to load the native libraries when the ColumnView class is loaded. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Kuhu Shukla (https://github.com/kuhushukla) URL: https://github.com/rapidsai/cudf/pull/9800 --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 729444f460c..6d0d24baf99 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -30,6 +30,10 @@ */ public class ColumnView implements AutoCloseable, BinaryOperable { + static { + NativeDepsLoader.loadNativeDeps(); + } + public static final long UNKNOWN_NULL_COUNT = -1; protected long viewHandle; From 20d6723fcb5eaffb6398e5cf6c14de8d774ca917 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 15:51:12 -0600 Subject: [PATCH 032/202] Copy Java native dependencies directly into classpath (#9787) Eliminates the intermediate copy of the native libraries for the Java bindings into target/native-deps, instead copying libcudf.so and libcudfjni.so directly into the classpath resources. This eliminates the need to search target/native-deps at runtime when the native libraries are not in the classpath in the case of running tests before the jar is built. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9787 --- java/pom.xml | 7 ++----- .../main/java/ai/rapids/cudf/NativeDepsLoader.java | 11 ++--------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index 87d43ec1272..c5a3bc64fad 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -297,9 +297,6 @@ LICENSE - - ${project.build.directory}/native-deps/ - @@ -499,14 +496,14 @@ copy-native-libs - validate + generate-resources copy-resources true ${skipNativeCopy} - ${project.build.directory}/native-deps/${os.arch}/${os.name} + ${project.build.outputDirectory}/${os.arch}/${os.name} ${native.build.path} diff --git a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java index 8780ecc3aa3..9663fbcafb4 100755 --- a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java +++ b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java @@ -81,9 +81,7 @@ public static synchronized void loadNativeDeps() { /** * Allows other libraries to reuse the same native deps loading logic. 
Libraries will be searched - * for under ${os.arch}/${os.name}/ in the class path using the class loader for this class. It - * will also look for the libraries under ./target/native-deps/${os.arch}/${os.name} to help - * facilitate testing while building. + * for under ${os.arch}/${os.name}/ in the class path using the class loader for this class. *
* Because this just loads the libraries and loading the libraries themselves needs to be a * singleton operation it is recommended that any library using this provide their own wrapper @@ -203,12 +201,7 @@ private static File createFile(String os, String arch, String baseName) throws I File loc; URL resource = loader.getResource(path); if (resource == null) { - // It looks like we are not running from the jar, or there are issues with the jar - File f = new File("./target/native-deps/" + path); - if (!f.exists()) { - throw new FileNotFoundException("Could not locate native dependency " + path); - } - resource = f.toURI().toURL(); + throw new FileNotFoundException("Could not locate native dependency " + path); } try (InputStream in = resource.openStream()) { loc = File.createTempFile(baseName, ".so"); From 991136c78be01d4de20387086a185cfd5a21713b Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Tue, 30 Nov 2021 15:31:53 -0800 Subject: [PATCH 033/202] Add Pearson correlation for sort groupby (python) (#9166) Fixes: https://github.com/rapidsai/cudf/issues/8691 Authors: - Sheilah Kirui (https://github.com/skirui-source) - Karthikeyan (https://github.com/karthikeyann) - Ashwin Srinath (https://github.com/shwina) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Michael Wang (https://github.com/isVoid) - Mayank Anand (https://github.com/mayankanand007) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9166 --- docs/cudf/source/api_docs/groupby.rst | 1 + docs/cudf/source/basics/groupby.rst | 10 ++ python/cudf/cudf/_lib/aggregation.pyx | 55 +++++++++- python/cudf/cudf/_lib/cpp/aggregation.pxd | 15 ++- python/cudf/cudf/_lib/groupby.pyx | 4 +- python/cudf/cudf/core/groupby/groupby.py | 121 +++++++++++++++++++++- python/cudf/cudf/tests/test_dataframe.py | 115 ++++++++++++++++++++ 7 files changed, 314 insertions(+), 7 deletions(-) diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst index cf08d1d791b..575d7442cdf 100644 --- a/docs/cudf/source/api_docs/groupby.rst +++ b/docs/cudf/source/api_docs/groupby.rst @@ -59,6 +59,7 @@ Computations / descriptive stats GroupBy.std GroupBy.sum GroupBy.var + GroupBy.corr The following methods are available in both ``SeriesGroupBy`` and ``DataFrameGroupBy`` objects, but may differ slightly, usually in that diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst index 04c4d42fa2a..f3269768025 100644 --- a/docs/cudf/source/basics/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -127,6 +127,13 @@ Aggregations on groups is supported via the ``agg`` method: a 1 4 1 2.0 2 5 2 4.5 + >>> df.groupby("a").corr(method="pearson") + b c + a + 1 b 1.000000 0.866025 + c 0.866025 1.000000 + 2 b 1.000000 1.000000 + c 1.000000 1.000000 The following table summarizes the available aggregations and the types that support them: @@ -169,6 +176,9 @@ that support them: +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ | unique | ✅ | ✅ | ✅ | ✅ | | | | | +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | corr | ✅ | | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + GroupBy apply ------------- diff --git 
a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 4f703724cef..68f7101b6ee 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -1,6 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. -from enum import Enum +from enum import Enum, IntEnum import numba import numpy as np @@ -30,6 +30,7 @@ from cudf._lib.types import Interpolation cimport cudf._lib.cpp.aggregation as libcudf_aggregation cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.cpp.aggregation cimport underlying_type_t_correlation_type import cudf @@ -57,6 +58,22 @@ class AggregationKind(Enum): UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA + CORRELATION = libcudf_aggregation.aggregation.Kind.CORRELATION + + +class CorrelationType(IntEnum): + PEARSON = ( + + libcudf_aggregation.correlation_type.PEARSON + ) + KENDALL = ( + + libcudf_aggregation.correlation_type.KENDALL + ) + SPEARMAN = ( + + libcudf_aggregation.correlation_type.SPEARMAN + ) cdef class Aggregation: @@ -321,6 +338,22 @@ cdef class Aggregation: )) return agg + @classmethod + def corr(cls, method, libcudf_types.size_type min_periods): + cdef Aggregation agg = cls() + cdef libcudf_aggregation.correlation_type c_method = ( + ( + ( + CorrelationType[method.upper()] + ) + ) + ) + agg.c_obj = move( + libcudf_aggregation.make_correlation_aggregation[aggregation]( + c_method, min_periods + )) + return agg + cdef class RollingAggregation: """A Cython wrapper for rolling window aggregations. @@ -692,6 +725,24 @@ cdef class GroupbyAggregation: ) return agg + @classmethod + def corr(cls, method, libcudf_types.size_type min_periods): + cdef GroupbyAggregation agg = cls() + cdef libcudf_aggregation.correlation_type c_method = ( + ( + ( + CorrelationType[method.upper()] + ) + ) + ) + agg.c_obj = move( + libcudf_aggregation. + make_correlation_aggregation[groupby_aggregation]( + c_method, min_periods + )) + return agg + + cdef class GroupbyScanAggregation: """A Cython wrapper for groupby scan aggregations. diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 13bfa49057c..3982b4fecbb 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,5 +1,5 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -11,6 +11,7 @@ from cudf._lib.cpp.types cimport ( size_type, ) +ctypedef int32_t underlying_type_t_correlation_type cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: @@ -38,6 +39,8 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' + CORRELATION 'cudf::aggregation::CORRELATION' + Kind kind cdef cppclass rolling_aggregation: @@ -53,6 +56,11 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: CUDA 'cudf::udf_type::CUDA' PTX 'cudf::udf_type::PTX' + ctypedef enum correlation_type: + PEARSON 'cudf::correlation_type::PEARSON' + KENDALL 'cudf::correlation_type::KENDALL' + SPEARMAN 'cudf::correlation_type::SPEARMAN' + cdef unique_ptr[T] make_sum_aggregation[T]() except + cdef unique_ptr[T] make_product_aggregation[T]() except + @@ -106,3 +114,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: udf_type type, string user_defined_aggregator, data_type output_type) except + + + cdef unique_ptr[T] make_correlation_aggregation[T]( + correlation_type type, size_type min_periods) except + diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 0968d22d465..314542c9549 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from collections import defaultdict @@ -54,7 +54,7 @@ _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", "UNIQUE"} _LIST_AGGS = {"COLLECT"} -_STRUCT_AGGS = set() +_STRUCT_AGGS = {"CORRELATION"} _INTERVAL_AGGS = set() _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", "NTH", "COLLECT"} diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 7f9f61ed3fd..f1d622362e2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import collections +import itertools import pickle import warnings @@ -13,7 +14,8 @@ from cudf._typing import DataFrameOrSeries from cudf.api.types import is_list_like from cudf.core.abc import Serializable -from cudf.core.column.column import arange +from cudf.core.column.column import arange, as_column +from cudf.core.multiindex import MultiIndex from cudf.utils.utils import GetAttrGetItemMixin, cached_property @@ -69,6 +71,8 @@ def __init__( """ self.obj = obj self._as_index = as_index + self._by = by + self._level = level self._sort = sort self._dropna = dropna @@ -777,6 +781,121 @@ def median(self): """Get the column-wise median of the values in each group.""" return self.agg("median") + def corr(self, method="pearson", min_periods=1): + """ + Compute pairwise correlation of columns, excluding NA/null values. + + Parameters + ---------- + method: {"pearson", "kendall", "spearman"} or callable, + default "pearson". Currently only the pearson correlation + coefficient is supported. + + min_periods: int, optional + Minimum number of observations required per pair of columns + to have a valid result. + + Returns + ---------- + DataFrame + Correlation matrix. + + Examples + -------- + >>> import cudf + >>> gdf = cudf.DataFrame({ + ... 
"id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) + >>> gdf + id val1 val2 val3 + 0 a 5 4 4 + 1 a 4 5 5 + 2 a 6 6 6 + 3 b 4 1 1 + 4 b 8 2 2 + 5 b 7 9 9 + 6 c 4 8 8 + 7 c 5 5 5 + 8 c 2 1 1 + >>> gdf.groupby("id").corr(method="pearson") + val1 val2 val3 + id + a val1 1.000000 0.500000 0.500000 + val2 0.500000 1.000000 1.000000 + val3 0.500000 1.000000 1.000000 + b val1 1.000000 0.385727 0.385727 + val2 0.385727 1.000000 1.000000 + val3 0.385727 1.000000 1.000000 + c val1 1.000000 0.714575 0.714575 + val2 0.714575 1.000000 1.000000 + val3 0.714575 1.000000 1.000000 + """ + + if not method.lower() in ("pearson",): + raise NotImplementedError( + "Only pearson correlation is currently supported" + ) + + # create expanded dataframe consisting all combinations of the + # struct columns-pairs to be correlated + # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) + _cols = self.grouping.values.columns.tolist() + len_cols = len(_cols) + + new_df_data = {} + for x, y in itertools.combinations_with_replacement(_cols, 2): + new_df_data[(x, y)] = cudf.DataFrame._from_data( + {"x": self.obj._data[x], "y": self.obj._data[y]} + ).to_struct() + new_gb = cudf.DataFrame._from_data(new_df_data).groupby( + by=self.grouping.keys + ) + + try: + gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods)) + except RuntimeError as e: + if "Unsupported groupby reduction type-agg combination" in str(e): + raise TypeError( + "Correlation accepts only numerical column-pairs" + ) + raise + + # ensure that column-pair labels are arranged in ascending order + cols_list = [ + (y, x) if i > j else (x, y) + for j, y in enumerate(_cols) + for i, x in enumerate(_cols) + ] + cols_split = [ + cols_list[i : i + len_cols] + for i in range(0, len(cols_list), len_cols) + ] + + # interleave: combine the correlation results for each column-pair + # into a single column + res = cudf.DataFrame._from_data( + { + x: gb_corr.loc[:, i].interleave_columns() + for i, x in zip(cols_split, _cols) + } + ) + + # create a multiindex for the groupby correlated dataframe, + # to match pandas behavior + unsorted_idx = gb_corr.index.repeat(len_cols) + idx_sort_order = unsorted_idx._get_sorted_inds() + sorted_idx = unsorted_idx._gather(idx_sort_order) + if len(gb_corr): + # TO-DO: Should the operation below be done on the CPU instead? + sorted_idx._data[None] = as_column( + cudf.Series(_cols).tile(len(gb_corr.index)) + ) + res.index = MultiIndex._from_data(sorted_idx._data) + + return res + def var(self, ddof=1): """Compute the column-wise variance of the values in each group. 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d07caef11d5..d555b5c4033 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8924,3 +8924,118 @@ def test_frame_series_where_other(data): expected = gdf.where(gdf["b"] == 1, 0) actual = pdf.where(pdf["b"] == 1, 0) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data, gkey", + [ + ( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + }, + ["id", "val1", "val2"], + ), + ( + { + "id": [0] * 4 + [1] * 3, + "a": [10, 3, 4, 2, -3, 9, 10], + "b": [10, 23, -4, 2, -3, 9, 19], + }, + ["id", "a"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val": [None, None, None, None, None, None], + }, + ["id"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val1": [None, 4, 6, 8, None, 2], + "val2": [4, 5, None, 2, 9, None], + }, + ["id"], + ), + ({"id": [1.0], "val1": [2.0], "val2": [3.0]}, ["id"]), + ], +) +@pytest.mark.parametrize( + "min_per", [0, 1, 2, 3, 4], +) +def test_pearson_corr_passing(data, gkey, min_per): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.groupby(gkey).corr(method="pearson", min_periods=min_per) + expected = pdf.groupby(gkey).corr(method="pearson", min_periods=min_per) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("method", ["kendall", "spearman"]) +def test_pearson_corr_unsupported_methods(method): + gdf = cudf.DataFrame( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + } + ) + + with pytest.raises( + NotImplementedError, + match="Only pearson correlation is currently supported", + ): + gdf.groupby("id").corr(method) + + +def test_pearson_corr_empty_columns(): + gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) + pdf = gdf.to_pandas() + + actual = gdf.groupby("id").corr("pearson") + expected = pdf.groupby("id").corr("pearson") + + assert_eq( + expected, actual, check_dtype=False, check_index_type=False, + ) + + +@pytest.mark.parametrize( + "data", + [ + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + }, + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + }, + ], +) +@pytest.mark.parametrize("gkey", ["id", "val1", "val2"]) +def test_pearson_corr_invalid_column_types(data, gkey): + with pytest.raises( + TypeError, match="Correlation accepts only numerical column-pairs", + ): + cudf.DataFrame(data).groupby(gkey).corr("pearson") + + +def test_pearson_corr_multiindex_dataframe(): + gdf = cudf.DataFrame( + {"a": [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [2, 3, 4, 5]} + ).set_index(["a", "b"]) + + actual = gdf.groupby(level="a").corr("pearson") + expected = gdf.to_pandas().groupby(level="a").corr("pearson") + + assert_eq(expected, actual) From 1eabcb73b7df235de9985e207e2087af9dfb0e14 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 1 Dec 2021 17:03:36 +0530 Subject: [PATCH 034/202] Fix some doxygen warnings and add missing documentation (#9770) fix to ignore `__device__ void` return type warnings. 
add missing documentation on some functions.
correct doxygen doc-style comments.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/9770
---
 cpp/doxygen/Doxyfile                           |  7 ++++---
 cpp/include/cudf/lists/combine.hpp             |  2 +-
 cpp/include/cudf/scalar/scalar_device_view.cuh | 16 ++++++++++++++++
 .../cudf/strings/convert/convert_lists.hpp     |  2 +-
 cpp/include/cudf/table/row_operators.cuh       |  3 ++-
 cpp/include/cudf_test/base_fixture.hpp         |  3 +++
 cpp/include/cudf_test/column_wrapper.hpp       |  3 +++
 cpp/include/cudf_test/file_utilities.hpp       |  9 +++++++++
 cpp/include/cudf_test/table_utilities.hpp      |  2 +-
 9 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index 55e5119040e..6a556bb4b34 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -2089,7 +2089,7 @@ ENABLE_PREPROCESSING = YES
 # The default value is: NO.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-MACRO_EXPANSION = NO
+MACRO_EXPANSION = YES
 
 # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
 # the macro expansion is limited to the macros specified with the PREDEFINED and
 # The default value is: NO.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-EXPAND_ONLY_PREDEF = NO
+EXPAND_ONLY_PREDEF = YES
 
 # If the SEARCH_INCLUDES tag is set to YES, the include files in the
 # INCLUDE_PATH will be searched if a #include is found.
 
@@ -2129,7 +2129,8 @@ INCLUDE_FILE_PATTERNS =
 # recursively expanded use the := operator instead of the = operator.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-PREDEFINED =
+PREDEFINED = __device__= \
+             __host__=
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp
index a9407ed57ca..61a81e8a745 100644
--- a/cpp/include/cudf/lists/combine.hpp
+++ b/cpp/include/cudf/lists/combine.hpp
@@ -26,7 +26,7 @@ namespace lists {
  * @file
 */
 
-/*
+/**
 * @brief Flag to specify whether a null list element will be ignored from concatenation, or the
 * entire concatenation result involving null list elements will be a null element.
*/ diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index 884b412d3e2..56afa150dfc 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -91,6 +91,12 @@ class fixed_width_scalar_device_view_base : public detail::scalar_device_view_ba return *data(); } + /** + * @brief Stores the value in scalar + * + * @tparam T The desired type + * @param value The value to store in scalar + */ template __device__ void set_value(T value) { @@ -159,6 +165,11 @@ class fixed_width_scalar_device_view : public detail::fixed_width_scalar_device_ return fixed_width_scalar_device_view_base::value(); } + /** + * @brief Stores the value in scalar + * + * @param value The value to store in scalar + */ __device__ void set_value(T value) { fixed_width_scalar_device_view_base::set_value(value); } /** @@ -218,6 +229,11 @@ class fixed_point_scalar_device_view : public detail::scalar_device_view_base { { } + /** + * @brief Stores the value in scalar + * + * @param value The value to store in scalar + */ __device__ void set_value(rep_type value) { *_data = value; } /** diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp index ec22186ea99..279bf44e7fc 100644 --- a/cpp/include/cudf/strings/convert/convert_lists.hpp +++ b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -50,7 +50,7 @@ namespace strings { * * @param input Lists column to format. * @param na_rep Replacment string for null elements. - * @param separator Strings to use for enclosing list components and separating elements. + * @param separators Strings to use for enclosing list components and separating elements. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index c719c564a87..70ccac2f75d 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -67,7 +67,7 @@ __device__ weak_ordering compare_elements(Element lhs, Element rhs) } } // namespace detail -/* +/** * @brief A specialization for floating-point `Element` type relational comparison * to derive the order of the elements with respect to `lhs`. Specialization is to * handle `nan` in the order shown below. @@ -187,6 +187,7 @@ class element_equality_comparator { * * @param lhs_element_index The index of the first element * @param rhs_element_index The index of the second element + * @return True if both lhs and rhs element are both nulls and `nulls_are_equal` is true, or equal * */ template ()>* = nullptr> T generate() @@ -211,6 +213,7 @@ class TempDirTestEnvironment : public ::testing::Environment { /** * @brief Get a temporary filepath to use for the specified filename * + * @param filename name of the file to be placed in temporary directory. 
   * @return std::string The temporary filepath
    */
   std::string get_temp_filepath(std::string filename) { return tmpdir.path() + filename; }
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index f291b04776a..cd2ac9f3ec1 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -79,6 +79,7 @@ class column_wrapper {
 
   /**
    * @brief Releases internal unique_ptr to wrapped column
+   * @return unique_ptr to wrapped column
    */
   std::unique_ptr release() { return std::move(wrapped); }
 
@@ -1040,11 +1041,13 @@ class dictionary_column_wrapper : public detail::column_wrapper {
 
   /**
    * @brief Access keys column view
+   * @return column_view to keys column
    */
   column_view keys() const { return cudf::dictionary_column_view{wrapped->view()}.keys(); }
 
   /**
    * @brief Access indices column view
+   * @return column_view to indices column
    */
   column_view indices() const { return cudf::dictionary_column_view{wrapped->view()}.indices(); }
 
diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp
index 90bf0cd99dc..8e242e5a4f3 100644
--- a/cpp/include/cudf_test/file_utilities.hpp
+++ b/cpp/include/cudf_test/file_utilities.hpp
@@ -24,6 +24,10 @@
 
 #include 
 
+/**
+ * @brief RAII class for creating a temporary directory.
+ *
+ */
 class temp_directory {
  std::string _path;
 
@@ -49,5 +53,10 @@ class temp_directory {
    nftw(_path.c_str(), rm_files, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
  }
 
+  /**
+   * @brief Returns the path of the temporary directory
+   *
+   * @return string path of the temporary directory
+   */
   const std::string& path() const { return _path; }
 };
diff --git a/cpp/include/cudf_test/table_utilities.hpp b/cpp/include/cudf_test/table_utilities.hpp
index 831c9f5ac14..f2427c5b8c6 100644
--- a/cpp/include/cudf_test/table_utilities.hpp
+++ b/cpp/include/cudf_test/table_utilities.hpp
@@ -39,7 +39,7 @@ void expect_table_properties_equal(cudf::table_view lhs, cudf::table_view rhs);
  */
 void expect_tables_equal(cudf::table_view lhs, cudf::table_view rhs);
 
-/*
+/**
 * @brief Verifies the equivalency of two tables.
 *
 * Treats null elements as equivalent. Columns that have nullability but no nulls,

From 1ceb8ab01120ffe463600db14e6893e196cbb991 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 1 Dec 2021 10:10:10 -0500
Subject: [PATCH 035/202] Improve build time of libcudf iterator tests (#9788)

While working on #9641 I noticed that building the iterator gtests takes a lot
of time in CI. Here is a link to the individual build times for libcudf
including the gtests:
https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=driver-495,LINUX_VER=ubuntu20.04,PYTHON=3.8/5173/testReport/(root)/BuildTime/
(you can sort by Duration by clicking on the table column header).

Here is a table of the top 20 compile time offenders as recorded on my local machine.
Note that, like the CI build output, 6 of the top 20 are just building the `ITERATOR_TEST`.

| rank | time (ms) | file |
| ---:| ---:|:--- |
| 1 | 814334 | /cudf.dir/src/search/search.cu.o
| 2 | 755375 | /cudf.dir/src/sort/sort_column.cu.o
| 3 | 686235 | /ITERATOR_TEST.dir/iterator/optional_iterator_test_numeric.cu.o
| 4 | 670587 | /cudf.dir/src/groupby/sort/group_nunique.cu.o
| 5 | 585524 | /cudf.dir/src/reductions/scan/scan_inclusive.cu.o
| 6 | 582677 | /ITERATOR_TEST.dir/iterator/pair_iterator_test_numeric.cu.o
| 7 | 568418 | /ITERATOR_TEST.dir/iterator/scalar_iterator_test.cu.o
| 8 | 563196 | /cudf.dir/src/sort/sort.cu.o
| 9 | 548816 | /ITERATOR_TEST.dir/iterator/value_iterator_test_numeric.cu.o
| 10 | 535315 | /cudf.dir/src/groupby/sort/sort_helper.cu.o
| 11 | 531384 | /cudf.dir/src/sort/is_sorted.cu.o
| 12 | 530382 | /ITERATOR_TEST.dir/iterator/value_iterator_test_chrono.cu.o
| 13 | 525187 | /cudf.dir/src/join/semi_join.cu.o
| 14 | 523726 | /cudf.dir/src/rolling/rolling.cu.o
| 15 | 517909 | /cudf.dir/src/reductions/product.cu.o
| 16 | 513119 | /cudf.dir/src/stream_compaction/distinct_count.cu.o
| 17 | 512569 | /ITERATOR_TEST.dir/iterator/optional_iterator_test_chrono.cu.o
| 18 | 508978 | /cudf.dir/src/reductions/sum_of_squares.cu.o
| 19 | 508460 | /cudf.dir/src/lists/drop_list_duplicates.cu.o
| 20 | 505247 | /cudf.dir/src/reductions/sum.cu.o

I made some simple changes to the iterator code logic to use different thrust
functions along with a temporary device vector. This approach improved the
compile time of the `ITERATOR_TEST` by about 3x. Here are the results of
compiling the above 6 files with the changes in this PR.

| new rank | new time (ms) | file |
| ---:| ---:|:--- |
| 59 | 232691 (2.9x) | optional_iterator_test_numeric.cu.o |
| 26 | 416951 (1.4x) | pair_iterator_test_numeric.cu.o |
| 92 | 165947 (3.4x) | scalar_iterator_test.cu.o |
| 65 | 216364 (2.5x) | value_iterator_test_numeric.cu.o |
| 77 | 186583 (2.8x) | value_iterator_test_chrono.cu.o |
| 111 | 137789 (3.7x) | optional_iterator_test_chrono.cu.o |

Total overall build time improved locally by ~3m (10%) using `ninja -j48 install`
on a Dell 5820. A minimal sketch of the code change follows; after it is a link
to the build time results of a CI build with these changes.
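The sketch below is a host-side illustration using `std::` algorithms as stand-ins for the `thrust::` calls in the actual change (the vectors and lambdas here are made up for the example, not taken from the test code): the single `equal` call is split into an elementwise `transform` into a temporary buffer plus an `all_of` pass over that buffer, which is the pattern that compiles faster with thrust.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> a{1, 2, 3, 4};
  std::vector<int> b{1, 2, 3, 4};

  // One-step form, analogous to the original thrust::equal call.
  bool eq1 = std::equal(a.begin(), a.end(), b.begin());

  // Two-step form, analogous to thrust::transform + thrust::all_of with a
  // temporary device vector: store the elementwise comparison results first,
  // then check that they are all true.
  std::vector<bool> results(a.size());
  std::transform(a.begin(), a.end(), b.begin(), results.begin(),
                 [](int x, int y) { return x == y; });
  bool eq2 =
    std::all_of(results.begin(), results.end(), [](bool v) { return v; });

  std::printf("%d %d\n", eq1, eq2);  // prints: 1 1
  return 0;
}
```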
https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=driver-495,LINUX_VER=ubuntu20.04,PYTHON=3.8/5190/testReport/(root)/BuildTime/ Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/9788 --- cpp/tests/iterator/iterator_tests.cuh | 17 +++++++-- .../optional_iterator_test_numeric.cu | 37 +++++++++---------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 4ec347c4bc1..07eb595449c 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -18,8 +18,8 @@ #include #include -#include // include iterator header -#include //for meanvar +#include +#include // for meanvar #include #include @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -83,7 +84,17 @@ struct IteratorTest : public cudf::test::BaseFixture { EXPECT_EQ(thrust::distance(d_in, d_in_last), num_items); auto dev_expected = cudf::detail::make_device_uvector_sync(expected); - bool result = thrust::equal(thrust::device, d_in, d_in_last, dev_expected.begin()); + // using a temporary vector and calling transform and all_of separately is + // equivalent to thrust::equal but compiles ~3x faster + auto dev_results = rmm::device_uvector(num_items, rmm::cuda_stream_default); + thrust::transform(thrust::device, + d_in, + d_in_last, + dev_expected.begin(), + dev_results.begin(), + thrust::equal_to{}); + auto result = thrust::all_of( + thrust::device, dev_results.begin(), dev_results.end(), thrust::identity{}); EXPECT_TRUE(result) << "thrust test"; } diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu index 6d51f4a5c14..a8c135a726f 100644 --- a/cpp/tests/iterator/optional_iterator_test_numeric.cu +++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu @@ -50,21 +50,15 @@ struct transformer_optional_meanvar { } }; -struct sum_if_not_null { - template - CUDA_HOST_DEVICE_CALLABLE thrust::optional operator()(const thrust::optional& lhs, - const thrust::optional& rhs) - { - return lhs.value_or(T{0}) + rhs.value_or(T{0}); - } +template +struct optional_to_meanvar { + CUDA_HOST_DEVICE_CALLABLE T operator()(const thrust::optional& v) { return v.value_or(T{0}); } }; // TODO: enable this test also at __CUDACC_DEBUG__ // This test causes fatal compilation error only at device debug mode. // Workaround: exclude this test only at device debug mode. #if !defined(__CUDACC_DEBUG__) -// This test computes `count`, `sum`, `sum_of_squares` at a single reduction call. 
-// It would be useful for `var`, `std` operation TYPED_TEST(NumericOptionalIteratorTest, mean_var_output) { using T = TypeParam; @@ -104,22 +98,27 @@ TYPED_TEST(NumericOptionalIteratorTest, mean_var_output) expected_value.value_squared = std::accumulate( replaced_array.begin(), replaced_array.end(), T{0}, [](T acc, T i) { return acc + i * i; }); - // std::cout << "expected = " << expected_value << std::endl; - // GPU test auto it_dev = d_col->optional_begin(cudf::contains_nulls::YES{}); auto it_dev_squared = thrust::make_transform_iterator(it_dev, transformer); - auto result = thrust::reduce(it_dev_squared, - it_dev_squared + d_col->size(), - thrust::optional{T_output{}}, - sum_if_not_null{}); + + // this can be computed with a single reduce and without a temporary output vector + // but the approach increases the compile time by ~2x + auto results = rmm::device_uvector(d_col->size(), rmm::cuda_stream_default); + thrust::transform(thrust::device, + it_dev_squared, + it_dev_squared + d_col->size(), + results.begin(), + optional_to_meanvar{}); + auto result = thrust::reduce(thrust::device, results.begin(), results.end(), T_output{}); + if (not std::is_floating_point()) { - EXPECT_EQ(expected_value, *result) << "optional iterator reduction sum"; + EXPECT_EQ(expected_value, result) << "optional iterator reduction sum"; } else { - EXPECT_NEAR(expected_value.value, result->value, 1e-3) << "optional iterator reduction sum"; - EXPECT_NEAR(expected_value.value_squared, result->value_squared, 1e-3) + EXPECT_NEAR(expected_value.value, result.value, 1e-3) << "optional iterator reduction sum"; + EXPECT_NEAR(expected_value.value_squared, result.value_squared, 1e-3) << "optional iterator reduction sum squared"; - EXPECT_EQ(expected_value.count, result->count) << "optional iterator reduction count"; + EXPECT_EQ(expected_value.count, result.count) << "optional iterator reduction count"; } } #endif From 11c3dfef2e7fe6fd67ff93bdf36a47c0a5b2eb37 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 1 Dec 2021 10:28:24 -0600 Subject: [PATCH 036/202] Remove unused masked udf cython/c++ code (#9792) This PR removes the c++ side of the original masked UDF code introduced in https://github.com/rapidsai/cudf/pull/8213. These kernels had some limitations and are now superseded by the numba-generated versions we moved to in https://github.com/rapidsai/cudf/pull/9174. As far as I can tell, cuDF python was the only thing consuming this API for the short time it has existed. However I am marking this breaking just in case. 
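For reference, the deleted JIT kernel (visible in the diff below) forwarded every input element to the UDF together with a validity flag read from the column's null mask, via a small `Masked` wrapper. Here is a minimal host-side sketch of that masked-value idea; the `masked_add` function and its null-propagation rule are illustrative assumptions for this sketch, not the exact contract of the removed kernel:

```cpp
#include <cstdio>

// A value paired with a validity flag, mirroring the Masked struct defined in
// the removed masked_udf_kernel.cu.
template <typename T>
struct Masked {
  T value;
  bool valid;
};

// Hypothetical masked UDF: the output is valid only when both inputs are
// valid, one common null-propagation convention for such UDFs.
template <typename T>
Masked<T> masked_add(Masked<T> lhs, Masked<T> rhs)
{
  return Masked<T>{lhs.value + rhs.value, lhs.valid && rhs.valid};
}

int main()
{
  Masked<int> a{2, true};
  Masked<int> b{3, false};  // represents a null element
  Masked<int> c = masked_add(a, b);
  std::printf("value=%d valid=%d\n", c.value, c.valid);  // value=5 valid=0
  return 0;
}
```

The numba-generated kernels referenced above carry the same (value, validity) pairing through the UDF, which is what made this C++ JIT path redundant.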
Authors: - https://github.com/brandon-b-miller Approvers: - Mark Harris (https://github.com/harrism) - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9792 --- .../Modules/JitifyPreprocessKernels.cmake | 4 +- cpp/include/cudf/transform.hpp | 6 -- cpp/src/transform/jit/masked_udf_kernel.cu | 85 --------------- cpp/src/transform/transform.cpp | 102 ------------------ python/cudf/cudf/_lib/cpp/transform.pxd | 6 -- python/cudf/cudf/_lib/transform.pyx | 24 ----- 6 files changed, 2 insertions(+), 225 deletions(-) delete mode 100644 cpp/src/transform/jit/masked_udf_kernel.cu diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index c2ad25760b8..6ab1293ab6f 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -51,8 +51,8 @@ function(jit_preprocess_files) endfunction() jit_preprocess_files( - SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu - transform/jit/masked_udf_kernel.cu transform/jit/kernel.cu rolling/jit/kernel.cu + SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu transform/jit/kernel.cu + rolling/jit/kernel.cu ) add_custom_target( diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index 55e7bc84dbe..45e8ff1310c 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -54,12 +54,6 @@ std::unique_ptr transform( bool is_ptx, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr generalized_masked_op( - table_view const& data_view, - std::string const& binary_udf, - data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Creates a null_mask from `input` by converting `NaN` to null and * preserving existing null values and also returns new null_count. diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu deleted file mode 100644 index 319ad730c53..00000000000 --- a/cpp/src/transform/jit/masked_udf_kernel.cu +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include - -namespace cudf { -namespace transformation { -namespace jit { - -template -struct Masked { - T value; - bool valid; -}; - -template -__device__ auto make_args(cudf::size_type id, TypeIn in_ptr, MaskType in_mask, OffsetType in_offset) -{ - bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true; - return cuda::std::make_tuple(in_ptr[id], valid); -} - -template -__device__ auto make_args(cudf::size_type id, - InType in_ptr, - MaskType in_mask, // in practice, always cudf::bitmask_type const* - OffsetType in_offset, // in practice, always cudf::size_type - Arguments... 
args) -{ - bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true; - return cuda::std::tuple_cat(cuda::std::make_tuple(in_ptr[id], valid), make_args(id, args...)); -} - -template -__global__ void generic_udf_kernel(cudf::size_type size, - TypeOut* out_data, - bool* out_mask, - Arguments... args) -{ - int const tid = threadIdx.x; - int const blkid = blockIdx.x; - int const blksz = blockDim.x; - int const gridsz = gridDim.x; - int const start = tid + blkid * blksz; - int const step = blksz * gridsz; - - Masked output; - for (cudf::size_type i = start; i < size; i += step) { - auto func_args = cuda::std::tuple_cat( - cuda::std::make_tuple(&output.value), - make_args(i, args...) // passed int64*, bool*, int64, int64*, bool*, int64 - ); - cuda::std::apply(GENERIC_OP, func_args); - out_data[i] = output.value; - out_mask[i] = output.valid; - } -} - -} // namespace jit -} // namespace transformation -} // namespace cudf diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 5230b853a79..0cca6699586 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -19,12 +19,10 @@ #include #include #include -#include #include #include #include -#include #include #include @@ -65,80 +63,6 @@ void unary_operation(mutable_column_view output, cudf::jit::get_data_ptr(input)); } -std::vector make_template_types(column_view outcol_view, table_view const& data_view) -{ - std::string mskptr_type = - cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id())) + "*"; - std::string offset_type = - cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id())); - - std::vector template_types; - template_types.reserve((3 * data_view.num_columns()) + 1); - - template_types.push_back(cudf::jit::get_type_name(outcol_view.type())); - for (auto const& col : data_view) { - template_types.push_back(cudf::jit::get_type_name(col.type()) + "*"); - template_types.push_back(mskptr_type); - template_types.push_back(offset_type); - } - return template_types; -} - -void generalized_operation(table_view const& data_view, - std::string const& udf, - data_type output_type, - mutable_column_view outcol_view, - mutable_column_view outmsk_view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const template_types = make_template_types(outcol_view, data_view); - - std::string generic_kernel_name = - jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel") - .instantiate(template_types); - - std::string generic_cuda_source = cudf::jit::parse_single_function_ptx( - udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0}); - - std::vector kernel_args; - kernel_args.reserve((data_view.num_columns() * 3) + 3); - - cudf::size_type size = outcol_view.size(); - const void* outcol_ptr = cudf::jit::get_data_ptr(outcol_view); - const void* outmsk_ptr = cudf::jit::get_data_ptr(outmsk_view); - kernel_args.insert(kernel_args.begin(), {&size, &outcol_ptr, &outmsk_ptr}); - - std::vector data_ptrs; - std::vector mask_ptrs; - std::vector offsets; - - data_ptrs.reserve(data_view.num_columns()); - mask_ptrs.reserve(data_view.num_columns()); - offsets.reserve(data_view.num_columns()); - - auto const iters = thrust::make_zip_iterator( - thrust::make_tuple(data_ptrs.begin(), mask_ptrs.begin(), offsets.begin())); - - std::for_each(iters, iters + data_view.num_columns(), [&](auto const& tuple_vals) { - kernel_args.push_back(&thrust::get<0>(tuple_vals)); - kernel_args.push_back(&thrust::get<1>(tuple_vals)); - 
kernel_args.push_back(&thrust::get<2>(tuple_vals)); - }); - - std::transform(data_view.begin(), data_view.end(), iters, [&](column_view const& col) { - return thrust::make_tuple(cudf::jit::get_data_ptr(col), col.null_mask(), col.offset()); - }); - - cudf::jit::get_program_cache(*transform_jit_masked_udf_kernel_cu_jit) - .get_kernel(generic_kernel_name, - {}, - {{"transform/jit/operation-udf.hpp", generic_cuda_source}}, - {"-arch=sm_."}) - ->configure_1d_max_occupancy(0, 0, 0, stream.value()) - ->launch(kernel_args.data()); -} - } // namespace jit } // namespace transformation @@ -165,24 +89,6 @@ std::unique_ptr transform(column_view const& input, return output; } -std::unique_ptr generalized_masked_op(table_view const& data_view, - std::string const& udf, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - std::unique_ptr output = make_fixed_width_column(output_type, data_view.num_rows()); - std::unique_ptr output_mask = - make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, data_view.num_rows()); - - transformation::jit::generalized_operation( - data_view, udf, output_type, *output, *output_mask, stream, mr); - - auto final_output_mask = cudf::bools_to_mask(*output_mask); - output.get()->set_null_mask(std::move(*(final_output_mask.first))); - return output; -} - } // namespace detail std::unique_ptr transform(column_view const& input, @@ -195,12 +101,4 @@ std::unique_ptr transform(column_view const& input, return detail::transform(input, unary_udf, output_type, is_ptx, rmm::cuda_stream_default, mr); } -std::unique_ptr generalized_masked_op(table_view const& data_view, - std::string const& udf, - data_type output_type, - rmm::mr::device_memory_resource* mr) -{ - return detail::generalized_masked_op(data_view, udf, output_type, rmm::cuda_stream_default, mr); -} - } // namespace cudf diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd index 3153427ce3c..590a371ff52 100644 --- a/python/cudf/cudf/_lib/cpp/transform.pxd +++ b/python/cudf/cudf/_lib/cpp/transform.pxd @@ -34,12 +34,6 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: bool is_ptx ) except + - cdef unique_ptr[column] generalized_masked_op( - const table_view& data_view, - string udf, - data_type output_type, - ) except + - cdef pair[unique_ptr[table], unique_ptr[column]] encode( table_view input ) except + diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index a0eb7c68183..96d25cb92c9 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -123,30 +123,6 @@ def transform(Column input, op): return Column.from_unique_ptr(move(c_output)) -def masked_udf(incols, op, output_type): - cdef table_view data_view = table_view_from_table( - incols, ignore_index=True) - cdef string c_str = op.encode("UTF-8") - cdef type_id c_tid - cdef data_type c_dtype - - c_tid = ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - output_type - ] - ) - c_dtype = data_type(c_tid) - - with nogil: - c_output = move(libcudf_transform.generalized_masked_op( - data_view, - c_str, - c_dtype, - )) - - return Column.from_unique_ptr(move(c_output)) - - def table_encode(input): cdef table_view c_input = table_view_from_table( input, ignore_index=True) From 1904d1a9ff54343471998523816c9e0a00f46797 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 1 Dec 2021 13:00:16 -0600 Subject: [PATCH 037/202] Fix overflow for min calculation in strings::from_timestamps (#9793) This fixes 
#9790.

When converting a timestamp to a string, it is possible for the `%M` minutes
calculation to overflow an int32_t partway through the computation. This
changes that intermediate result to an int64_t, which avoids the overflow. For
example, the new test timestamp 128849018880000 ms is 128849018880 seconds, or
2147483648 total minutes, one more than an int32_t can represent.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/9793
---
 cpp/src/strings/convert/convert_datetime.cu | 4 ++--
 cpp/tests/strings/datetime_tests.cpp        | 8 +++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index 51a6a796ba3..8d0c5704a7b 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -707,9 +707,9 @@ struct from_timestamp_base {
    * scale( 61,60) -> 1
    * @endcode
    */
-  __device__ int32_t scale_time(int64_t time, int64_t base) const
+  __device__ int64_t scale_time(int64_t time, int64_t base) const
   {
-    return static_cast<int32_t>((time - ((time < 0) * (base - 1L))) / base);
+    return (time - ((time < 0) * (base - 1L))) / base;
   };
 
   __device__ time_components get_time_components(int64_t tstamp) const
diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp
index 4543607614f..9a01d5dd041 100644
--- a/cpp/tests/strings/datetime_tests.cpp
+++ b/cpp/tests/strings/datetime_tests.cpp
@@ -311,13 +311,14 @@ TEST_F(StringsDatetimeTest, FromTimestampAmPm)
 TEST_F(StringsDatetimeTest, FromTimestampMillisecond)
 {
   cudf::test::fixed_width_column_wrapper timestamps_ms{
-    1530705600123, 1582934461007, 1451430122421, 1318302183999, -6106017600047};
+    1530705600123, 1582934461007, 1451430122421, 1318302183999, -6106017600047, 128849018880000};
   auto results = cudf::strings::from_timestamps(timestamps_ms, "%Y-%m-%d %H:%M:%S.%3f");
   cudf::test::strings_column_wrapper expected_ms{"2018-07-04 12:00:00.123",
                                                  "2020-02-29 00:01:01.007",
                                                  "2015-12-29 23:02:02.421",
                                                  "2011-10-11 03:03:03.999",
-                                                 "1776-07-04 11:59:59.953"};
+                                                 "1776-07-04 11:59:59.953",
+                                                 "6053-01-23 02:08:00.000"};
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ms);
 
   results = cudf::strings::from_timestamps(timestamps_ms, "%Y-%m-%d %H:%M:%S.%f");
   expected_ms_6f{"2018-07-04 12:00:00.123000",
                  "2020-02-29 00:01:01.007000",
                  "2015-12-29 23:02:02.421000",
                  "2011-10-11 03:03:03.999000",
-                 "1776-07-04 11:59:59.953000"};
+                 "1776-07-04 11:59:59.953000",
+                 "6053-01-23 02:08:00.000000"};
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ms_6f);
 
   cudf::test::fixed_width_column_wrapper timestamps_ns{

From 836f800e61acafa0fa6b3c7d9826904f0ba2ad06 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <36027403+codereport@users.noreply.github.com>
Date: Wed, 1 Dec 2021 16:46:14 -0500
Subject: [PATCH 038/202] Use CTAD with Thrust function objects (#9768)

While reviewing another PR, I noticed unnecessary use of explicit template
parameters with Thrust function objects and decided to open a small PR to
clean this up (CTAD, class template argument deduction, arrived in C++17).
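As a quick illustration of what CTAD buys here, consider the sketch below. It uses `std::plus`, which `thrust::plus` mirrors for this purpose; the example is illustrative and not taken from the PR:

```cpp
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> v{1, 2, 3, 4};

  // Pre-CTAD style: the template argument is spelled out explicitly.
  int a = std::accumulate(v.begin(), v.end(), 0, std::plus<int>{});

  // CTAD style: with no constructor arguments, the default template argument
  // is used, yielding the transparent std::plus<void>, whose operator()
  // deduces its operand types at each call site.
  int b = std::accumulate(v.begin(), v.end(), 0, std::plus{});

  std::printf("%d %d\n", a, b);  // prints: 10 10
  return 0;
}
```

In the diff below, the same trailing `{}` spelling replaces explicitly parameterized function objects such as `thrust::plus`, `thrust::equal_to`, and `thrust::identity`.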
CI depends on https://github.com/rapidsai/cudf/pull/9766 Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Mike Wilson (https://github.com/hyperbolic2346) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9768 --- cpp/include/cudf/strings/detail/gather.cuh | 2 +- cpp/include/cudf_test/column_wrapper.hpp | 7 ++----- cpp/src/copying/concatenate.cu | 2 +- cpp/src/groupby/sort/group_merge_m2.cu | 4 ++-- cpp/src/groupby/sort/group_rank_scan.cu | 2 +- cpp/src/groupby/sort/group_scan_util.cuh | 6 +++--- .../sort/group_single_pass_reduction_util.cuh | 16 ++++++++-------- cpp/src/groupby/sort/group_tdigest.cu | 10 +++++----- cpp/src/join/hash_join.cu | 2 +- cpp/src/join/join_utils.cu | 2 +- .../lists/combine/concatenate_list_elements.cu | 2 +- cpp/src/lists/contains.cu | 7 ++----- cpp/src/lists/interleave_columns.cu | 8 ++++---- cpp/src/quantiles/tdigest/tdigest.cu | 7 ++----- cpp/src/reductions/scan/scan_inclusive.cu | 9 ++++----- cpp/src/rolling/grouped_rolling.cu | 6 +++--- cpp/src/rolling/rolling_collect_list.cu | 2 +- cpp/src/sort/rank.cu | 10 +++++----- cpp/src/strings/copying/concatenate.cu | 2 +- cpp/src/strings/findall.cu | 7 ++----- cpp/src/strings/repeat_strings.cu | 2 +- cpp/src/strings/split/split.cu | 14 ++++---------- cpp/tests/iterator/iterator_tests.cuh | 11 +++-------- .../apply_boolean_mask_tests.cpp | 4 ++-- cpp/tests/strings/fixed_point_tests.cpp | 2 +- cpp/tests/transform/row_bit_count_test.cu | 6 ++---- 26 files changed, 63 insertions(+), 89 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index ec4a88a0e46..eb7258830ce 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -315,7 +315,7 @@ std::unique_ptr gather( d_out_offsets + output_count, [] __device__(auto size) { return static_cast(size); }, size_t{0}, - thrust::plus{}); + thrust::plus{}); CUDF_EXPECTS(total_bytes < static_cast(std::numeric_limits::max()), "total size of output strings is too large for a cudf column"); diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index cd2ac9f3ec1..ccfdde2270c 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1502,11 +1502,8 @@ class lists_column_wrapper : public detail::column_wrapper { // concatenate them together, skipping children that are null. std::vector children; - thrust::copy_if(std::cbegin(cols), - std::cend(cols), - valids, // stencil - std::back_inserter(children), - thrust::identity{}); + thrust::copy_if( + std::cbegin(cols), std::cend(cols), valids, std::back_inserter(children), thrust::identity{}); auto data = children.empty() ? 
cudf::empty_like(expected_hierarchy) : concatenate(children); diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index f4b6a8bf5fd..34c0cea683e 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -79,7 +79,7 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi device_views.cend(), std::next(offsets.begin()), [](auto const& col) { return col.size(); }, - thrust::plus{}); + thrust::plus{}); auto d_offsets = make_device_uvector_async(offsets, stream); auto const output_size = offsets.back(); diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu index 4e2a5b68abc..bde7c985df1 100644 --- a/cpp/src/groupby/sort/group_merge_m2.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -173,8 +173,8 @@ std::unique_ptr group_merge_m2(column_view const& values, // Generate bitmask for the output. // Only mean and M2 values can be nullable. Count column must be non-nullable. - auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr); if (null_count > 0) { result_means->set_null_mask(null_mask, null_count); // copy null_mask result_M2s->set_null_mask(std::move(null_mask), null_count); // take over null_mask diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 935ef9554a9..f36bdc0a660 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -79,7 +79,7 @@ std::unique_ptr rank_generator(column_view const& order_by, group_labels.end(), mutable_ranks.begin(), mutable_ranks.begin(), - thrust::equal_to{}, + thrust::equal_to{}, scan_op); return ranks; diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index ae3e3232e06..e25fdd6fc27 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -115,7 +115,7 @@ struct group_scan_functor() group_labels.end(), inp_iter, out_iter, - thrust::equal_to{}, + thrust::equal_to{}, binop); }; @@ -160,7 +160,7 @@ struct group_scan_functor{}, + thrust::equal_to{}, binop); }; @@ -214,7 +214,7 @@ struct group_scan_functor{}, + thrust::equal_to{}, binop); }; diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index decb127b264..95a36f40e57 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -191,7 +191,7 @@ struct group_reduction_functor{}, + thrust::equal_to{}, binop); }; @@ -215,10 +215,10 @@ struct group_reduction_functor validity(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + thrust::logical_or{}); - auto [null_mask, null_count] = cudf::detail::valid_if( - validity.begin(), validity.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); result->set_null_mask(std::move(null_mask), null_count); } return result; @@ -264,7 +264,7 @@ struct group_reduction_functor< inp_iter, thrust::make_discard_iterator(), out_iter, - thrust::equal_to{}, + thrust::equal_to{}, binop); }; @@ -283,10 +283,10 @@ struct group_reduction_functor< auto validity = 
rmm::device_uvector(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + thrust::logical_or{}); - auto [null_mask, null_count] = cudf::detail::valid_if( - validity.begin(), validity.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); result->set_null_mask(std::move(null_mask), null_count); } else { auto const binop = diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu index 146a6a8c31c..551eb128231 100644 --- a/cpp/src/groupby/sort/group_tdigest.cu +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -625,7 +625,7 @@ std::unique_ptr compute_tdigests(int delta, centroids_begin, // values thrust::make_discard_iterator(), // key output output, // output - thrust::equal_to{}, // key equality check + thrust::equal_to{}, // key equality check merge_centroids{}); // create final tdigest column @@ -850,8 +850,8 @@ std::unique_ptr group_merge_tdigest(column_view const& input, min_iter, thrust::make_discard_iterator(), merged_min_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::minimum{}); + thrust::equal_to{}, // key equality check + thrust::minimum{}); auto merged_max_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); @@ -864,8 +864,8 @@ std::unique_ptr group_merge_tdigest(column_view const& input, max_iter, thrust::make_discard_iterator(), merged_max_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::maximum{}); + thrust::equal_to{}, // key equality check + thrust::maximum{}); // for any empty groups, set the min and max to be 0. not technically necessary but it makes // testing simpler. 
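The recurring change in these hunks is that Thrust function objects are now value-initialized without an explicit element type (`thrust::plus{}`, `thrust::equal_to{}`, `thrust::identity{}`, ...), which under C++17 selects the transparent specialization whose `operator()` deduces its argument types at each call site. A minimal sketch of the idea, separate from the diff (assumes C++17 and a Thrust version that ships the transparent `void` specializations):

```cpp
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>

int main()
{
  int const data[] = {1, 2, 3, 4};
  // Old style: the element type is spelled out and must be kept in sync
  // with the value type of the reduction by hand.
  int const a = thrust::reduce(thrust::seq, data, data + 4, 0, thrust::plus<int>{});
  // New style: thrust::plus{} is the transparent functor; its operator()
  // deduces the operand types per call, so no template argument is needed.
  int const b = thrust::reduce(thrust::seq, data, data + 4, 0, thrust::plus{});
  return a == b ? 0 : 1;  // both compute 10
}
```

Besides being shorter, dropping the explicit type avoids a class of bugs where the functor's element type silently disagrees with the accumulator's (e.g. a 32-bit `plus` used in a `size_t` reduction).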
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index e4bd1938ecc..c5b680f129e 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -266,7 +266,7 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, left_join_complement_size = thrust::count_if(rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), - thrust::identity()); + thrust::identity()); } return join_size + left_join_complement_size; } diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 4aca4b4a9cf..9e98f87e7f0 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -136,7 +136,7 @@ get_left_join_indices_complement(std::unique_ptr> thrust::make_counting_iterator(end_counter), invalid_index_map->begin(), right_indices_complement->begin(), - thrust::identity()) - + thrust::identity{}) - right_indices_complement->begin(); right_indices_complement->resize(indices_count, stream); } diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index 4bef312b396..2ddede97ce4 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -225,7 +225,7 @@ std::unique_ptr concatenate_lists_nullifying_rows(column_view const& inp auto list_entries = gather_list_entries(input, offsets_view, num_rows, num_output_entries, stream, mr); auto [null_mask, null_count] = cudf::detail::valid_if( - list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); return make_lists_column(num_rows, std::move(list_offsets), diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index bdbc9ae013c..b48982d205a 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -74,11 +74,8 @@ struct lookup_functor { if (!search_keys_have_nulls && !input_lists.has_nulls() && !input_lists.child().has_nulls()) { return {rmm::device_buffer{0, stream, mr}, size_type{0}}; } else { - return cudf::detail::valid_if(result_validity.begin(), - result_validity.end(), - thrust::identity{}, - stream, - mr); + return cudf::detail::valid_if( + result_validity.begin(), result_validity.end(), thrust::identity{}, stream, mr); } } diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index b9b73d98ed2..220cb25a942 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -228,8 +228,8 @@ struct interleave_list_entries_impl{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr); return make_strings_column(num_output_entries, std::move(offsets_column), @@ -306,7 +306,7 @@ struct interleave_list_entries_impl( if (data_has_null_mask) { auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); + validities.begin(), validities.end(), thrust::identity{}, stream, mr); if (null_count > 0) { output->set_null_mask(null_mask, null_count); } } @@ -405,7 +405,7 @@ std::unique_ptr interleave_columns(table_view const& input, } auto [null_mask, null_count] = cudf::detail::valid_if( - list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); return make_lists_column(num_output_lists, std::move(list_offsets), 
std::move(list_entries), diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 57c221b15ed..18e7d02d086 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -348,11 +348,8 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, if (null_count == 0) { return std::pair{rmm::device_buffer{}, null_count}; } - return cudf::detail::valid_if(tdigest_is_empty, - tdigest_is_empty + tdv.size(), - thrust::logical_not{}, - stream, - mr); + return cudf::detail::valid_if( + tdigest_is_empty, tdigest_is_empty + tdv.size(), thrust::logical_not{}, stream, mr); }(); return cudf::make_lists_column( diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index 70f5ca90539..b0e761c4c3b 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -50,11 +50,10 @@ rmm::device_buffer mask_scan(column_view const& input_view, auto valid_itr = detail::make_validity_iterator(*d_input); auto first_null_position = [&] { - size_type const first_null = thrust::find_if_not(rmm::exec_policy(stream), - valid_itr, - valid_itr + input_view.size(), - thrust::identity{}) - - valid_itr; + size_type const first_null = + thrust::find_if_not( + rmm::exec_policy(stream), valid_itr, valid_itr + input_view.size(), thrust::identity{}) - + valid_itr; size_type const exclusive_offset = (inclusive == scan_type::EXCLUSIVE) ? 1 : 0; return std::min(input_view.size(), first_null + exclusive_offset); }(); diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 509f67bb5c6..5a7f15148d8 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -142,8 +142,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, preceding_window] __device__(size_type idx) { auto group_label = d_group_labels[idx]; auto group_start = d_group_offsets[group_label]; - return thrust::minimum{}(preceding_window, - idx - group_start + 1); // Preceding includes current row. + return thrust::minimum{}(preceding_window, + idx - group_start + 1); // Preceding includes current row. }; auto following_calculator = [d_group_offsets = group_offsets.data(), @@ -152,7 +152,7 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, auto group_label = d_group_labels[idx]; auto group_end = d_group_offsets[group_label + 1]; // Cannot fall off the end, since offsets // is capped with `input.size()`. 
- return thrust::minimum{}(following_window, (group_end - 1) - idx); + return thrust::minimum{}(following_window, (group_end - 1) - idx); }; if (aggr.kind == aggregation::CUDA || aggr.kind == aggregation::PTX) { diff --git a/cpp/src/rolling/rolling_collect_list.cu b/cpp/src/rolling/rolling_collect_list.cu index ecef90dc8e1..30c39bde7d2 100644 --- a/cpp/src/rolling/rolling_collect_list.cu +++ b/cpp/src/rolling/rolling_collect_list.cu @@ -75,7 +75,7 @@ std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view con per_row_mapping_begin, per_row_mapping_begin + num_child_rows, per_row_mapping_begin, - thrust::maximum{}); + thrust::maximum{}); return per_row_mapping; } diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index c8a908e44cd..e9589e6c4b3 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -117,7 +117,7 @@ void tie_break_ranks_transform(cudf::device_span dense_rank_sor tie_iter, thrust::make_discard_iterator(), tie_sorted.begin(), - thrust::equal_to{}, + thrust::equal_to{}, tie_breaker); auto sorted_tied_rank = thrust::make_transform_iterator( dense_rank_sorted.begin(), @@ -171,8 +171,8 @@ void rank_min(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::minimum{}, - thrust::identity{}, + thrust::minimum{}, + thrust::identity{}, stream); } @@ -189,8 +189,8 @@ void rank_max(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::maximum{}, - thrust::identity{}, + thrust::maximum{}, + thrust::identity{}, stream); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index db8b37a9592..3822fa8bf5a 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -96,7 +96,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s device_views_ptr + views.size(), std::next(d_partition_offsets.begin()), chars_size_transform{}, - thrust::plus{}); + thrust::plus{}); auto const output_chars_size = d_partition_offsets.back_element(stream); stream.synchronize(); // ensure copy of output_chars_size is complete before returning diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu index 3ab5b55020c..8d96f0de415 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/findall.cu @@ -153,11 +153,8 @@ std::unique_ptr
findall_re( std::vector> results; - size_type const columns = thrust::reduce(rmm::exec_policy(stream), - find_counts.begin(), - find_counts.end(), - 0, - thrust::maximum{}); + size_type const columns = thrust::reduce( + rmm::exec_policy(stream), find_counts.begin(), find_counts.end(), 0, thrust::maximum{}); // boundary case: if no columns, return all nulls column (issue #119) if (columns == 0) results.emplace_back(std::make_unique( diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 458f3ed885c..7820e0064a6 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -369,7 +369,7 @@ std::pair, int64_t> repeat_strings_output_sizes( thrust::make_counting_iterator(strings_count), fn, int64_t{0}, - thrust::plus{}); + thrust::plus{}); return std::make_pair(std::move(output_sizes), total_bytes); } diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 5113b418501..c6e52a79059 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -490,11 +490,8 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, }); // the columns_count is the maximum number of tokens for any string - auto const columns_count = thrust::reduce(rmm::exec_policy(stream), - token_counts.begin(), - token_counts.end(), - 0, - thrust::maximum{}); + auto const columns_count = thrust::reduce( + rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); // boundary case: if no columns, return one null column (custrings issue #119) if (columns_count == 0) { results.push_back(std::make_unique( @@ -748,11 +745,8 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, [tokenizer] __device__(size_type idx) { return tokenizer.count_tokens(idx); }); // column count is the maximum number of tokens for any string - size_type const columns_count = thrust::reduce(rmm::exec_policy(stream), - token_counts.begin(), - token_counts.end(), - 0, - thrust::maximum{}); + size_type const columns_count = thrust::reduce( + rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); std::vector> results; // boundary case: if no columns, return one null column (issue #119) diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 07eb595449c..d93c1275122 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -51,13 +51,8 @@ struct IteratorTest : public cudf::test::BaseFixture { // Get temporary storage size size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, - temp_storage_bytes, - d_in, - dev_result.begin(), - num_items, - thrust::minimum{}, - init); + cub::DeviceReduce::Reduce( + nullptr, temp_storage_bytes, d_in, dev_result.begin(), num_items, thrust::minimum{}, init); // Allocate temporary storage rmm::device_buffer d_temp_storage(temp_storage_bytes, rmm::cuda_stream_default); @@ -68,7 +63,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in, dev_result.begin(), num_items, - thrust::minimum{}, + thrust::minimum{}, init); evaluate(expected, dev_result, "cub test"); diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 813cceb0861..c80a8fba55c 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -204,13 +204,13 @@ TEST_F(ApplyBooleanMask, FixedPointLargeColumnTest) dec32_data.cend(), mask_data.cbegin(), std::back_inserter(expect_dec32_data), - thrust::identity()); + thrust::identity{}); thrust::copy_if(thrust::seq, dec64_data.cbegin(), dec64_data.cend(), mask_data.cbegin(), std::back_inserter(expect_dec64_data), - thrust::identity()); + thrust::identity{}); decimal32_wrapper expect_col32( expect_dec32_data.begin(), expect_dec32_data.end(), numeric::scale_type{-3}); diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index ce4280e0733..5872a9e5bb7 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -329,4 +329,4 @@ TEST_F(StringsConvertTest, DISABLED_FixedPointStringConversionOperator) auto const c = numeric::decimal128{numeric::scaled_integer{max, numeric::scale_type{-38}}}; EXPECT_EQ(static_cast(c), "1.70141183460469231731687303715884105727"); -} \ No newline at end of file +} diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 7fb7326f221..43d63c9fd22 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -239,10 +239,8 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) // List child column = {0, 1, 2, 3, 4, ..., 2*num_rows}; auto ints = make_numeric_column(data_type{type_id::INT32}, num_rows * 2); auto ints_view = ints->mutable_view(); - thrust::tabulate(thrust::device, - ints_view.begin(), - ints_view.end(), - thrust::identity()); + thrust::tabulate( + thrust::device, ints_view.begin(), ints_view.end(), thrust::identity{}); // List offsets = {0, 2, 4, 6, 8, ..., num_rows*2}; auto list_offsets = 
make_numeric_column(data_type{type_id::INT32}, num_rows + 1); From 677e63236a81ea3c402df993845a1fdc98072c9e Mon Sep 17 00:00:00 2001 From: Conor Hoekstra <36027403+codereport@users.noreply.github.com> Date: Wed, 1 Dec 2021 16:46:25 -0500 Subject: [PATCH 039/202] Avoid overflow for `fixed_point` `cudf::cast` and performance optimization (#9772) This resolves https://github.com/rapidsai/cudf/issues/9000. When using `cudf::cast` for a wider decimal type to a narrower decimal type, you can overflow. This PR modifies the code path for this specific use case so that the "rescale" happens for the type cast. A small perf improvement was added when you have identical scales to avoid rescaling. CI depends on https://github.com/rapidsai/cudf/pull/9766 Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9772 --- cpp/src/unary/cast_ops.cu | 49 +++++++++++++++++++++------------- cpp/tests/unary/cast_tests.cpp | 13 +++++++++ 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index e852b00796a..131fde11cf8 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -305,28 +305,39 @@ struct dispatch_unary_cast_to { rmm::mr::device_memory_resource* mr) { using namespace numeric; - - auto const size = input.size(); - auto temporary = - std::make_unique(cudf::data_type{type.id(), input.type().scale()}, - size, - rmm::device_buffer{size * cudf::size_of(type), stream}, - copy_bitmask(input, stream), - input.null_count()); - using SourceDeviceT = device_storage_type_t; using TargetDeviceT = device_storage_type_t; - mutable_column_view output_mutable = *temporary; - - thrust::transform(rmm::exec_policy(stream), - input.begin(), - input.end(), - output_mutable.begin(), - device_cast{}); - - // clearly there is a more efficient way to do this, can optimize in the future - return rescale(*temporary, numeric::scale_type{type.scale()}, stream, mr); + auto casted = [&]() { + auto const size = input.size(); + auto output = std::make_unique(cudf::data_type{type.id(), input.type().scale()}, + size, + rmm::device_buffer{size * cudf::size_of(type), stream}, + copy_bitmask(input, stream), + input.null_count()); + + mutable_column_view output_mutable = *output; + + thrust::transform(rmm::exec_policy(stream), + input.begin(), + input.end(), + output_mutable.begin(), + device_cast{}); + + return output; + }; + + if (input.type().scale() == type.scale()) return casted(); + + if constexpr (sizeof(SourceDeviceT) < sizeof(TargetDeviceT)) { + // device_cast BEFORE rescale when SourceDeviceT is < TargetDeviceT + auto temporary = casted(); + return detail::rescale(*temporary, scale_type{type.scale()}, stream, mr); + } else { + // device_cast AFTER rescale when SourceDeviceT is > TargetDeviceT to avoid overflow + auto temporary = detail::rescale(input, scale_type{type.scale()}, stream, mr); + return detail::cast(*temporary, type, stream, mr); + } } template view()); } + +TEST_F(FixedPointTestSingleType, Int32ToInt64Convert) +{ + using namespace numeric; + using fp_wrapperA = cudf::test::fixed_point_column_wrapper; + using fp_wrapperB = cudf::test::fixed_point_column_wrapper; + + auto const input = fp_wrapperB{{141230900000L}, scale_type{-10}}; + auto const expected = fp_wrapperA{{14123}, scale_type{-3}}; + auto const result = cudf::cast(input, 
make_fixed_point_data_type(-3)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} From 7d8a8e53f495279ae129fa46948c07230d6e77b4 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 1 Dec 2021 13:53:05 -0800 Subject: [PATCH 040/202] Allow cast decimal128 to string and add tests (#9756) Small PR that enables Decimal128 cast Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9756 --- java/src/main/native/src/ColumnViewJni.cpp | 3 ++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 4efac307627..02d5dc4569c 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -916,7 +916,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas case cudf::type_id::INT64: case cudf::type_id::UINT64: result = cudf::strings::from_integers(*column); break; case cudf::type_id::DECIMAL32: - case cudf::type_id::DECIMAL64: result = cudf::strings::from_fixed_point(*column); break; + case cudf::type_id::DECIMAL64: + case cudf::type_id::DECIMAL128: result = cudf::strings::from_fixed_point(*column); break; default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); } } else if (column->type().id() == cudf::type_id::STRING) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index fa9052029cc..31a52eb2ec0 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3372,6 +3372,22 @@ void testFixedWidthCast() { } } + @Test + void testCastBigDecimalToString() { + BigDecimal[] bigValues = {new BigDecimal("923121331938210123.321"), + new BigDecimal("9223372036854775808.191"), + new BigDecimal("9328323982309091029831.002") + }; + + try (ColumnVector cv = ColumnVector.fromDecimals(bigValues); + ColumnVector values = cv.castTo(DType.STRING); + ColumnVector expected = ColumnVector.fromStrings("923121331938210123.321", + "9223372036854775808.191", + "9328323982309091029831.002")) { + assertColumnsAreEqual(expected, values); + } + } + @Test void testCastStringToBigDecimal() { String[] bigValues = {"923121331938210123.321", From 5491cc789bbfbaad7099124dcfe004719e7f013c Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 2 Dec 2021 03:30:50 +0530 Subject: [PATCH 041/202] Fix memory error due to lambda return type deduction limitation (#9778) Fixes #9703 replace device lambda with device functor with return type. (due to [14. 
extended-lambda-restrictions](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-restrictions))

~add `__host__` to lambda for nvcc return type deduction to work properly.~
~replaced `auto` (generic lambda) with `size_type`.~

Fixes the shared memory write error reported in #9703.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/9778
---
 cpp/src/sort/rank.cu         | 13 +++++++++----
 cpp/tests/sort/rank_test.cpp | 14 ++++++++++++++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu
index e9589e6c4b3..de0a44e3234 100644
--- a/cpp/src/sort/rank.cu
+++ b/cpp/src/sort/rank.cu
@@ -194,6 +194,12 @@ void rank_max(cudf::device_span group_keys,
     stream);
 }
 
+// Returns index, count
+template
+struct index_counter {
+  __device__ T operator()(size_type i) { return T{i, 1}; }
+};
+
 void rank_average(cudf::device_span group_keys,
                   column_view sorted_order_view,
                   mutable_column_view rank_mutable_view,
@@ -208,10 +214,9 @@ void rank_average(cudf::device_span group_keys,
   using MinCount = thrust::pair;
   tie_break_ranks_transform(
     group_keys,
-    cudf::detail::make_counting_transform_iterator(1,
-                                                   [] __device__(auto i) {
-                                                     return MinCount{i, 1};
-                                                   }),
+    // Use device functor with return type. Cannot use device lambda due to limitation.
+    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-restrictions
+    cudf::detail::make_counting_transform_iterator(1, index_counter{}),
     sorted_order_view,
     rank_mutable_view.begin(),
     [] __device__(auto rank_count1, auto rank_count2) {
diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp
index 94e389fc7ce..926ad1e203e 100644
--- a/cpp/tests/sort/rank_test.cpp
+++ b/cpp/tests/sort/rank_test.cpp
@@ -410,5 +410,19 @@ TYPED_TEST(Rank, min_desc_bottom_pct)
   this->run_all_tests(rank_method::MIN, desc_bottom, col1_rank, col2_rank, col3_rank, true);
 }
 
+struct RankLarge : public BaseFixture {
+};
+
+TEST_F(RankLarge, average_large)
+{
+  // testcase of https://github.com/rapidsai/cudf/issues/9703
+  auto iter = thrust::counting_iterator(0);
+  fixed_width_column_wrapper col1(iter, iter + 10558);
+  auto result =
+    cudf::rank(col1, rank_method::AVERAGE, {}, null_policy::EXCLUDE, null_order::AFTER, false);
+  fixed_width_column_wrapper expected(iter + 1, iter + 10559);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+}
+
 } // namespace test
 } // namespace cudf

From c10966cc3847ca9837ddc7ce5df9c4d9b7c743d8 Mon Sep 17 00:00:00 2001
From: Alfred Xu
Date: Thu, 2 Dec 2021 18:48:03 +0800
Subject: [PATCH 042/202] Fix make_empty_scalar_like on list_type (#9759)

Fixes #9758

In `make_empty_scalar_like`, we create the list scalar from the list column itself, which is wrong. The correct way is to build it from the child of the list column.
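As a sketch of the type-level distinction behind the fix (illustrative only; the helper name and the `LIST<INT32>` input are hypothetical, and error handling is omitted):

```cpp
#include <cudf/copying.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar_factories.hpp>

#include <memory>

// Builds an invalid (null) empty list scalar matching a LIST column, e.g. LIST<INT32>.
std::unique_ptr<cudf::scalar> empty_list_scalar_like(cudf::column_view const& col)
{
  // Wrong: cudf::empty_like(col) is itself a LIST<INT32> column, so wrapping it
  // in a list scalar would produce a LIST<LIST<INT32>> scalar -- one nesting
  // level too deep.
  // Right: use the child column (INT32) so the scalar has type LIST<INT32>.
  auto const empty_child = cudf::empty_like(cudf::lists_column_view(col).child());
  auto result = cudf::make_list_scalar(empty_child->view());
  result->set_valid_async(false);
  return result;
}
```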
Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Nghia Truong (https://github.com/ttnghia) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/9759 --- cpp/src/scalar/scalar_factories.cpp | 7 +++++-- cpp/tests/reductions/reduction_tests.cpp | 8 ++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index d2876435780..c18b57d220f 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -21,6 +21,7 @@ #include #include +#include #include namespace cudf { @@ -184,10 +185,12 @@ std::unique_ptr make_empty_scalar_like(column_view const& column, { std::unique_ptr result; switch (column.type().id()) { - case type_id::LIST: - result = make_list_scalar(empty_like(column)->view(), stream, mr); + case type_id::LIST: { + auto const empty_child = empty_like(lists_column_view(column).child()); + result = make_list_scalar(empty_child->view(), stream, mr); result->set_valid_async(false, stream); break; + } case type_id::STRUCT: // The input column must have at least 1 row to extract a scalar (row) from it. result = detail::get_element(column, 0, stream, mr); diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index d8ee8f9d08d..e138cd6f68e 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -1961,7 +1961,11 @@ struct ListReductionTest : public cudf::test::BaseFixture { cudf::reduce(input_data, agg, cudf::data_type(cudf::type_id::LIST)); auto list_result = dynamic_cast(result.get()); EXPECT_EQ(is_valid, list_result->is_valid()); - if (is_valid) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view()); } + if (is_valid) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view()); + } else { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_value, list_result->view()); + } }; if (succeeded_condition) { @@ -2047,7 +2051,7 @@ TEST_F(ListReductionTest, NonValidListReductionNthElement) // test against empty input this->reduction_test(LCW{}, - ElementCol{{0}, {0}}, // expected_value, + ElementCol{}, // expected_value, true, false, cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); From 582cc6e466c7d941e1b34893fd56fbd42fe90d68 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 2 Dec 2021 21:12:01 +0800 Subject: [PATCH 043/202] Add sample JNI API (#9728) Add sample JNI Signed-off-by: Chong Gao Authors: - Chong Gao (https://github.com/res-life) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9728 --- java/src/main/java/ai/rapids/cudf/Table.java | 30 +++++++++++++++++++ java/src/main/native/src/TableJni.cpp | 15 ++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 21 +++++++++++++ 3 files changed, 66 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b0791fb440f..b11808ed023 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -678,6 +678,8 @@ private static native ContiguousTable[] contiguousSplitGroups(long inputTable, boolean[] keysDescending, boolean[] keysNullSmallest); + private static native long[] sample(long tableHandle, long n, boolean replacement, long seed); + ///////////////////////////////////////////////////////////////////////////// // TABLE CREATION APIs 
///////////////////////////////////////////////////////////////////////////// @@ -2801,6 +2803,34 @@ public static Table fromPackedTable(ByteBuffer metadata, DeviceMemoryBuffer data return result; } + + /** + * Gather `n` samples from table randomly + * Note: does not preserve the ordering + * Example: + * input: {col1: {1, 2, 3, 4, 5}, col2: {6, 7, 8, 9, 10}} + * n: 3 + * replacement: false + * + * output: {col1: {3, 1, 4}, col2: {8, 6, 9}} + * + * replacement: true + * + * output: {col1: {3, 1, 1}, col2: {8, 6, 6}} + * + * throws "logic_error" if `n` > table rows and `replacement` == FALSE. + * throws "logic_error" if `n` < 0. + * + * @param n non-negative number of samples expected from table + * @param replacement Allow or disallow sampling of the same row more than once. + * @param seed Seed value to initiate random number generator. + * + * @return Table containing samples + */ + public Table sample(long n, boolean replacement, long seed) { + return new Table(sample(nativeHandle, n, replacement, seed)); + } + ///////////////////////////////////////////////////////////////////////////// // HELPER CLASSES ///////////////////////////////////////////////////////////////////////////// diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index a78d40a58f7..f3377bb002d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -3147,4 +3148,18 @@ JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups( CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_sample(JNIEnv *env, jclass, jlong j_input, + jlong n, jboolean replacement, + jlong seed) { + JNI_NULL_CHECK(env, j_input, "input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input = reinterpret_cast(j_input); + auto sample_with_replacement = + replacement ? 
cudf::sample_with_replacement::TRUE : cudf::sample_with_replacement::FALSE; + std::unique_ptr result = cudf::sample(*input, n, sample_with_replacement, seed); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index fa221e19387..0b2f56895e9 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -7584,4 +7584,25 @@ void testExplodeOuterPosition() { } } } + + @Test + void testSample() { + try (Table t = new Table.TestBuilder().column("s1", "s2", "s3", "s4", "s5").build()) { + try (Table ret = t.sample(3, false, 0); + Table expected = new Table.TestBuilder().column("s3", "s4", "s5").build()) { + assertTablesAreEqual(expected, ret); + } + + try (Table ret = t.sample(5, false, 0); + Table expected = new Table.TestBuilder().column("s3", "s4", "s5", "s2", "s1").build()) { + assertTablesAreEqual(expected, ret); + } + + try (Table ret = t.sample(8, true, 0); + Table expected = new Table.TestBuilder() + .column("s1", "s1", "s4", "s5", "s5", "s1", "s3", "s2").build()) { + assertTablesAreEqual(expected, ret); + } + } + } } From 1077daeaad8ff710de6f4fbb99f2e7371b4af8de Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 2 Dec 2021 15:51:04 -0600 Subject: [PATCH 044/202] Fix caching in `Series.applymap` (#9821) The cache key we were generating for these functions didn't take into account the constants that could be different in the bytecode. Hence certain functions were causing cache hits when they actually differ by a constant value somewhere in the logic. Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9821 --- python/cudf/cudf/tests/test_udf_masked_ops.py | 19 +++++++++++++++++++ python/cudf/cudf/utils/cudautils.py | 4 +++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index dc126546f15..c9c2c440632 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -593,3 +593,22 @@ def func(row, c, k): return y run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) + + +def test_masked_udf_caching(): + # Make sure similar functions that differ + # by simple things like constants actually + # recompile + + data = cudf.Series([1, 2, 3]) + expect = data ** 2 + got = data.applymap(lambda x: x ** 2) + + assert_eq(expect, got, check_dtype=False) + + # update the constant value being used and make sure + # it does not result in a cache hit + + expect = data ** 3 + got = data.applymap(lambda x: x ** 3) + assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 5fa091a0081..f0533dcaa72 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -216,12 +216,14 @@ def make_cache_key(udf, sig): recompiling the same function for the same set of types """ codebytes = udf.__code__.co_code + constants = udf.__code__.co_consts if udf.__closure__ is not None: cvars = tuple([x.cell_contents for x in udf.__closure__]) cvarbytes = dumps(cvars) else: cvarbytes = b"" - 
return codebytes, cvarbytes, sig + + return constants, codebytes, cvarbytes, sig def compile_udf(udf, type_signature): From 50acf076d4a35bc57dc00a416f0d9507b1992c0f Mon Sep 17 00:00:00 2001 From: MithunR Date: Thu, 2 Dec 2021 14:07:31 -0800 Subject: [PATCH 045/202] Fix stream usage in `segmented_gather()` (#9679) `detail::segmented_gather()` inadvertently uses `cuda_default_stream` in some parts of its implementation, while using the user-specified stream in others. This applies to the calls to `copy_range_in_place()`, `allocate_like()`, and `make_lists_column()`. ~This might produce race conditions, which might explain NVIDIA/spark-rapids/issues/4060. It's a rare failure that's quite hard to reproduce.~ This might lead to over-synchronization, though bad output is unlikely. The commit here should sort this out, by switching to the `detail` APIs corresponding to the calls above. Authors: - MithunR (https://github.com/mythrocks) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9679 --- cpp/src/lists/copying/segmented_gather.cu | 21 ++++++++++++--------- cpp/src/lists/extract.cu | 2 +- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index 8cbcddc1c58..41187b96cdb 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include -#include #include #include #include @@ -88,14 +88,15 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, auto child = std::move(child_table->release().front()); // Create list offsets from gather_map. - auto output_offset = cudf::allocate_like( - gather_map.offsets(), gather_map.size() + 1, mask_allocation_policy::RETAIN, mr); + auto output_offset = cudf::detail::allocate_like( + gather_map.offsets(), gather_map.size() + 1, mask_allocation_policy::RETAIN, stream, mr); auto output_offset_view = output_offset->mutable_view(); - cudf::copy_range_in_place(gather_map.offsets(), - output_offset_view, - gather_map.offset(), - gather_map.offset() + output_offset_view.size(), - 0); + cudf::detail::copy_range_in_place(gather_map.offsets(), + output_offset_view, + gather_map.offset(), + gather_map.offset() + output_offset_view.size(), + 0, + stream); // Assemble list column & return auto null_mask = cudf::detail::copy_bitmask(value_column.parent(), stream, mr); size_type null_count = value_column.null_count(); @@ -103,7 +104,9 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, std::move(output_offset), std::move(child), null_count, - std::move(null_mask)); + std::move(null_mask), + stream, + mr); } } // namespace detail diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index 381864e1a68..7c6c612eb25 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -53,7 +53,7 @@ std::unique_ptr make_index_child(column_view const& indices, // `segmented_gather()` on a null index should produce a null row. if (not indices.nullable()) { return std::make_unique(indices, stream); } - auto const d_indices = column_device_view::create(indices); + auto const d_indices = column_device_view::create(indices, stream); // Replace null indices with MAX_SIZE_TYPE, so that gather() returns null for them. 
auto const null_replaced_iter_begin = cudf::detail::make_null_replacement_iterator(*d_indices, std::numeric_limits::max()); From b848dd5c9cfef7e3523810d67296e037f31945c1 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 2 Dec 2021 14:40:57 -0800 Subject: [PATCH 046/202] Fix ORC writer crash with empty input columns (#9808) Fixes https://github.com/rapidsai/cudf/issues/9783 Skip some parts of writing when the input table was zero rows. Add is_empty to `hostdevice_2dvector`. Add Python test with empty columns. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) - Devavret Makkar (https://github.com/devavret) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9808 --- cpp/src/io/orc/writer_impl.cu | 338 +++++++++++---------- cpp/src/io/utilities/hostdevice_vector.hpp | 1 + python/cudf/cudf/tests/test_orc.py | 15 + 3 files changed, 188 insertions(+), 166 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e53fb3589bc..db02125ce77 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -579,12 +579,15 @@ orc_streams writer::impl::create_streams(host_span columns, } auto const direct_data_size = - std::accumulate(segmentation.stripes.front().cbegin(), - segmentation.stripes.back().cend(), - size_t{0}, - [&](auto data_size, auto rg_idx) { - return data_size + column.host_dict_chunk(rg_idx)->string_char_count; - }); + segmentation.num_stripes() == 0 + ? 0 + : std::accumulate(segmentation.stripes.front().cbegin(), + segmentation.stripes.back().cend(), + size_t{0}, + [&](auto data_size, auto rg_idx) { + return data_size + + column.host_dict_chunk(rg_idx)->string_char_count; + }); if (enable_dict) { uint32_t dict_bits = 0; for (dict_bits = 1; dict_bits < 32; dict_bits <<= 1) { @@ -988,17 +991,19 @@ encoded_data encode_columns(orc_table_view const& orc_table, } chunk_streams.host_to_device(stream); - if (orc_table.num_string_columns() != 0) { - auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict(); - gpu::EncodeStripeDictionaries(d_stripe_dict, - chunks, - orc_table.num_string_columns(), - segmentation.num_stripes(), - chunk_streams, - stream); - } + if (orc_table.num_rows() > 0) { + if (orc_table.num_string_columns() != 0) { + auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict(); + gpu::EncodeStripeDictionaries(d_stripe_dict, + chunks, + orc_table.num_string_columns(), + segmentation.num_stripes(), + chunk_streams, + stream); + } - gpu::EncodeOrcColumnData(chunks, chunk_streams, stream); + gpu::EncodeOrcColumnData(chunks, chunk_streams, stream); + } dictionaries.data.clear(); dictionaries.index.clear(); stream.synchronize(); @@ -1803,7 +1808,7 @@ void writer::impl::write(table_view const& table) auto dictionaries = allocate_dictionaries(orc_table, rowgroup_bounds, stream); hostdevice_2dvector dict( rowgroup_bounds.size().first, orc_table.num_string_columns(), stream); - if (orc_table.num_string_columns() != 0) { + if (not dict.is_empty()) { init_dictionaries(orc_table, rowgroup_bounds, dictionaries.d_data_view, @@ -1819,7 +1824,7 @@ void writer::impl::write(table_view const& table) // Build stripe-level dictionaries hostdevice_2dvector stripe_dict( segmentation.num_stripes(), orc_table.num_string_columns(), stream); - if (orc_table.num_string_columns() != 0) { + if (not stripe_dict.is_empty()) { build_dictionaries(orc_table, 
segmentation.stripes, dict, @@ -1842,165 +1847,166 @@ void writer::impl::write(table_view const& table) segmentation.num_stripes(), num_data_streams, stream); auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data.streams, &strm_descs); - // Gather column statistics - std::vector column_stats; - if (enable_statistics_ && table.num_columns() > 0 && num_rows > 0) { - column_stats = gather_statistic_blobs(orc_table, segmentation); - } + if (num_rows > 0) { + // Gather column statistics + auto const column_stats = enable_statistics_ && table.num_columns() > 0 + ? gather_statistic_blobs(orc_table, segmentation) + : std::vector{}; - // Allocate intermediate output stream buffer - size_t compressed_bfr_size = 0; - size_t num_compressed_blocks = 0; - size_t max_compressed_block_size = 0; - if (compression_kind_ != NONE) { - nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size); - } - auto stream_output = [&]() { - size_t max_stream_size = 0; - bool all_device_write = true; + // Allocate intermediate output stream buffer + size_t compressed_bfr_size = 0; + size_t num_compressed_blocks = 0; + size_t max_compressed_block_size = 0; + if (compression_kind_ != NONE) { + nvcompBatchedSnappyCompressGetMaxOutputChunkSize( + compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size); + } + auto stream_output = [&]() { + size_t max_stream_size = 0; + bool all_device_write = true; + + for (auto& ss : strm_descs.host_view().flat_view()) { + if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } + size_t stream_size = ss.stream_size; + if (compression_kind_ != NONE) { + ss.first_block = num_compressed_blocks; + ss.bfr_offset = compressed_bfr_size; + + auto num_blocks = std::max( + (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); + stream_size += num_blocks * BLOCK_HEADER_SIZE; + num_compressed_blocks += num_blocks; + compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks; + } + max_stream_size = std::max(max_stream_size, stream_size); + } - for (auto& ss : strm_descs.host_view().flat_view()) { - if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } - size_t stream_size = ss.stream_size; - if (compression_kind_ != NONE) { - ss.first_block = num_compressed_blocks; - ss.bfr_offset = compressed_bfr_size; - - auto num_blocks = std::max( - (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); - stream_size += num_blocks * BLOCK_HEADER_SIZE; - num_compressed_blocks += num_blocks; - compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks; + if (all_device_write) { + return pinned_buffer{nullptr, cudaFreeHost}; + } else { + return pinned_buffer{[](size_t size) { + uint8_t* ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_stream_size), + cudaFreeHost}; } - max_stream_size = std::max(max_stream_size, stream_size); - } + }(); - if (all_device_write) { - return pinned_buffer{nullptr, cudaFreeHost}; - } else { - return pinned_buffer{[](size_t size) { - uint8_t* ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_stream_size), - cudaFreeHost}; + // Compress the data streams + rmm::device_buffer compressed_data(compressed_bfr_size, stream); + hostdevice_vector comp_out(num_compressed_blocks, stream); + hostdevice_vector comp_in(num_compressed_blocks, stream); + if (compression_kind_ != 
NONE) { + strm_descs.host_to_device(stream); + gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), + num_compressed_blocks, + compression_kind_, + compression_blocksize_, + max_compressed_block_size, + strm_descs, + enc_data.streams, + comp_in, + comp_out, + stream); + strm_descs.device_to_host(stream); + comp_out.device_to_host(stream, true); } - }(); - - // Compress the data streams - rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_out(num_compressed_blocks, stream); - hostdevice_vector comp_in(num_compressed_blocks, stream); - if (compression_kind_ != NONE) { - strm_descs.host_to_device(stream); - gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), - num_compressed_blocks, - compression_kind_, - compression_blocksize_, - max_compressed_block_size, - strm_descs, - enc_data.streams, - comp_in, - comp_out, - stream); - strm_descs.device_to_host(stream); - comp_out.device_to_host(stream, true); - } - ProtobufWriter pbw_(&buffer_); - - // Write stripes - std::vector> write_tasks; - for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { - auto const& rowgroups_range = segmentation.stripes[stripe_id]; - auto& stripe = stripes[stripe_id]; - - stripe.offset = out_sink_->bytes_written(); - - // Column (skippable) index streams appear at the start of the stripe - for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { - write_index_stream(stripe_id, - stream_id, - orc_table.columns, - rowgroups_range, - enc_data.streams, - strm_descs, - comp_out, - &stripe, - &streams, - &pbw_); - } + ProtobufWriter pbw_(&buffer_); + + // Write stripes + std::vector> write_tasks; + for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { + auto const& rowgroups_range = segmentation.stripes[stripe_id]; + auto& stripe = stripes[stripe_id]; + + stripe.offset = out_sink_->bytes_written(); + + // Column (skippable) index streams appear at the start of the stripe + for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { + write_index_stream(stripe_id, + stream_id, + orc_table.columns, + rowgroups_range, + enc_data.streams, + strm_descs, + comp_out, + &stripe, + &streams, + &pbw_); + } - // Column data consisting one or more separate streams - for (auto const& strm_desc : strm_descs[stripe_id]) { - write_tasks.push_back( - write_data_stream(strm_desc, - enc_data.streams[strm_desc.column_id][rowgroups_range.first], - static_cast(compressed_data.data()), - stream_output.get(), - &stripe, - &streams)); - } + // Column data consisting one or more separate streams + for (auto const& strm_desc : strm_descs[stripe_id]) { + write_tasks.push_back( + write_data_stream(strm_desc, + enc_data.streams[strm_desc.column_id][rowgroups_range.first], + static_cast(compressed_data.data()), + stream_output.get(), + &stripe, + &streams)); + } - // Write stripefooter consisting of stream information - StripeFooter sf; - sf.streams = streams; - sf.columns.resize(orc_table.num_columns() + 1); - sf.columns[0].kind = DIRECT; - for (size_t i = 1; i < sf.columns.size(); ++i) { - sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); - sf.columns[i].dictionarySize = - (sf.columns[i].kind == DICTIONARY_V2) - ? 
orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings - : 0; - if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } + // Write stripefooter consisting of stream information + StripeFooter sf; + sf.streams = streams; + sf.columns.resize(orc_table.num_columns() + 1); + sf.columns[0].kind = DIRECT; + for (size_t i = 1; i < sf.columns.size(); ++i) { + sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); + sf.columns[i].dictionarySize = + (sf.columns[i].kind == DICTIONARY_V2) + ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings + : 0; + if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } + } + buffer_.resize((compression_kind_ != NONE) ? 3 : 0); + pbw_.write(sf); + stripe.footerLength = buffer_.size(); + if (compression_kind_ != NONE) { + uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; + buffer_[0] = static_cast(uncomp_sf_len >> 0); + buffer_[1] = static_cast(uncomp_sf_len >> 8); + buffer_[2] = static_cast(uncomp_sf_len >> 16); + } + out_sink_->host_write(buffer_.data(), buffer_.size()); } - buffer_.resize((compression_kind_ != NONE) ? 3 : 0); - pbw_.write(sf); - stripe.footerLength = buffer_.size(); - if (compression_kind_ != NONE) { - uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; - buffer_[0] = static_cast(uncomp_sf_len >> 0); - buffer_[1] = static_cast(uncomp_sf_len >> 8); - buffer_[2] = static_cast(uncomp_sf_len >> 16); + for (auto const& task : write_tasks) { + task.wait(); } - out_sink_->host_write(buffer_.data(), buffer_.size()); - } - for (auto const& task : write_tasks) { - task.wait(); - } - if (column_stats.size() != 0) { - // File-level statistics - // NOTE: Excluded from chunked write mode to avoid the need for merging stats across calls - if (single_write_mode) { - // First entry contains total number of rows - buffer_.resize(0); - pbw_.putb(1 * 8 + PB_TYPE_VARINT); - pbw_.put_uint(num_rows); - ff.statistics.reserve(1 + orc_table.num_columns()); - ff.statistics.emplace_back(std::move(buffer_)); - // Add file stats, stored after stripe stats in `column_stats` - ff.statistics.insert( - ff.statistics.end(), - std::make_move_iterator(column_stats.begin()) + stripes.size() * orc_table.num_columns(), - std::make_move_iterator(column_stats.end())); - } - // Stripe-level statistics - size_t first_stripe = md.stripeStats.size(); - md.stripeStats.resize(first_stripe + stripes.size()); - for (size_t stripe_id = 0; stripe_id < stripes.size(); stripe_id++) { - md.stripeStats[first_stripe + stripe_id].colStats.resize(1 + orc_table.num_columns()); - buffer_.resize(0); - pbw_.putb(1 * 8 + PB_TYPE_VARINT); - pbw_.put_uint(stripes[stripe_id].numberOfRows); - md.stripeStats[first_stripe + stripe_id].colStats[0] = std::move(buffer_); - for (size_t col_idx = 0; col_idx < orc_table.num_columns(); col_idx++) { - size_t idx = stripes.size() * col_idx + stripe_id; - if (idx < column_stats.size()) { - md.stripeStats[first_stripe + stripe_id].colStats[1 + col_idx] = - std::move(column_stats[idx]); + if (not column_stats.empty()) { + // File-level statistics + // NOTE: Excluded from chunked write mode to avoid the need for merging stats across calls + if (single_write_mode) { + // First entry contains total number of rows + buffer_.resize(0); + pbw_.putb(1 * 8 + PB_TYPE_VARINT); + pbw_.put_uint(num_rows); + ff.statistics.reserve(1 + orc_table.num_columns()); + ff.statistics.emplace_back(std::move(buffer_)); + // Add file stats, stored after stripe stats in `column_stats` + 
ff.statistics.insert(
+        ff.statistics.end(),
+        std::make_move_iterator(column_stats.begin()) + stripes.size() * orc_table.num_columns(),
+        std::make_move_iterator(column_stats.end()));
+    }
+    // Stripe-level statistics
+    size_t first_stripe = md.stripeStats.size();
+    md.stripeStats.resize(first_stripe + stripes.size());
+    for (size_t stripe_id = 0; stripe_id < stripes.size(); stripe_id++) {
+      md.stripeStats[first_stripe + stripe_id].colStats.resize(1 + orc_table.num_columns());
+      buffer_.resize(0);
+      pbw_.putb(1 * 8 + PB_TYPE_VARINT);
+      pbw_.put_uint(stripes[stripe_id].numberOfRows);
+      md.stripeStats[first_stripe + stripe_id].colStats[0] = std::move(buffer_);
+      for (size_t col_idx = 0; col_idx < orc_table.num_columns(); col_idx++) {
+        size_t idx = stripes.size() * col_idx + stripe_id;
+        if (idx < column_stats.size()) {
+          md.stripeStats[first_stripe + stripe_id].colStats[1 + col_idx] =
+            std::move(column_stats[idx]);
+        }
       }
     }
   }
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 283715478a0..a7f9aec7bb4 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -179,6 +179,7 @@ class hostdevice_2dvector {
 
   auto size() const noexcept { return _size; }
   auto count() const noexcept { return _size.first * _size.second; }
+  auto is_empty() const noexcept { return count() == 0; }
 
   T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); }
   T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); }
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 6b02874146e..dc176992434 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1526,3 +1526,18 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir):
     # Segfaults when RLE stream sizes don't account for varint length
     pa_out = pa.orc.ORCFile(reencoded).read()
     assert_eq(df.to_pandas(), pa_out)
+
+
+def test_empty_columns():
+    buffer = BytesIO()
+    # string and decimal columns have additional steps that need to be skipped
+    expected = cudf.DataFrame(
+        {
+            "string": cudf.Series([], dtype="str"),
+            "decimal": cudf.Series([], dtype=cudf.Decimal64Dtype(10, 1)),
+        }
+    )
+    expected.to_orc(buffer, compression="snappy")
+
+    got_df = cudf.read_orc(buffer)
+    assert_eq(expected, got_df)

From 0c08543955a01470baa4fbdbab927298dcf6afd9 Mon Sep 17 00:00:00 2001
From: Devavret Makkar
Date: Fri, 3 Dec 2021 04:53:37 +0530
Subject: [PATCH 047/202] Update cmake and conda to 22.02 (#9746)

The update to 22.02 was missed in one conda environment recipe (the CUDA 11.5 one); this PR adds it. It also updates the project version from 21.12 to 22.02 in the CMake files.
Authors:
  - Devavret Makkar (https://github.com/devavret)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Ray Douglass (https://github.com/raydouglass)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/9746
---
 ci/release/update-version.sh     | 6 +++---
 cpp/CMakeLists.txt               | 2 +-
 cpp/libcudf_kafka/CMakeLists.txt | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index eeb76a15fcc..86432a92128 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -30,13 +30,13 @@ function sed_runner() {
 }
 
 # cpp update
-sed_runner 's/'"CUDF VERSION .* LANGUAGES"'/'"CUDF VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/CMakeLists.txt
 
 # cpp libcudf_kafka update
-sed_runner 's/'"CUDA_KAFKA VERSION .* LANGUAGES"'/'"CUDA_KAFKA VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/libcudf_kafka/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt
 
 # cpp cudf_jni update
-sed_runner 's/'"CUDF_JNI VERSION .* LANGUAGES"'/'"CUDF_JNI VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' java/src/main/native/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' java/src/main/native/CMakeLists.txt
 
 # rapids-cmake version
 sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 50bdc30b292..e2b317f2e03 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF)
 
 project(
   CUDF
-  VERSION 21.12.00
+  VERSION 22.02.00
   LANGUAGES C CXX CUDA
 )
diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt
index 435ff3b5987..d0874b57c2d 100644
--- a/cpp/libcudf_kafka/CMakeLists.txt
+++ b/cpp/libcudf_kafka/CMakeLists.txt
@@ -22,7 +22,7 @@ include(rapids-find)
 
 project(
   CUDA_KAFKA
-  VERSION 21.12.00
+  VERSION 22.02.00
   LANGUAGES CXX
 )

From ce64e53264d21c6e59fe98548796a7b6bae24c07 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora"
Date: Thu, 2 Dec 2021 20:19:12 -0600
Subject: [PATCH 048/202] Add directory-partitioned data support to cudf.read_parquet (#9720)

Closes #9684
Closes #9690

This PR refactors path handling in `cudf.read_parquet` and uses `pyarrow.dataset` to add support for directory-partitioned datasets (with full filtering support at row-group granularity). Since it is my understanding that some users may wish for directory-partitioned columns to be represented as a raw dtype (rather than always becoming categorical), I also added an optional `categorical_partitions` argument (open to suggestions on a better name).
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/9720 --- python/cudf/cudf/io/json.py | 2 +- python/cudf/cudf/io/orc.py | 2 +- python/cudf/cudf/io/parquet.py | 286 +++++++++++++++++++---- python/cudf/cudf/tests/test_parquet.py | 94 +++++++- python/cudf/cudf/tests/test_s3.py | 9 +- python/cudf/cudf/utils/ioutils.py | 26 ++- python/dask_cudf/dask_cudf/io/parquet.py | 7 +- 7 files changed, 355 insertions(+), 71 deletions(-) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index a48cfd07d3f..1f876214b16 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -37,7 +37,7 @@ def read_json( for source in path_or_buf: if ioutils.is_directory(source, **kwargs): fs = ioutils._ensure_filesystem( - passed_filesystem=None, path=source + passed_filesystem=None, path=source, **kwargs ) source = ioutils.stringify_pathlike(source) source = fs.sep.join([source, "*.json"]) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index ecb1b0cd185..c1cce3f996f 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -316,7 +316,7 @@ def read_orc( for source in filepath_or_buffer: if ioutils.is_directory(source, **kwargs): fs = ioutils._ensure_filesystem( - passed_filesystem=None, path=source + passed_filesystem=None, path=source, **kwargs, ) source = stringify_path(source) source = fs.sep.join([source, "*.orc"]) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 9d665d9a0a5..04d64969a16 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -12,6 +12,7 @@ import cudf from cudf._lib import parquet as libparquet from cudf.api.types import is_list_like +from cudf.core.column import as_column, build_categorical_column from cudf.utils import ioutils @@ -80,7 +81,7 @@ def write_to_dataset( kwargs for to_parquet function. """ - fs = ioutils._ensure_filesystem(fs, root_path) + fs = ioutils._ensure_filesystem(fs, root_path, **kwargs) fs.mkdirs(root_path, exist_ok=True) metadata = [] @@ -163,11 +164,19 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names -def _process_row_groups(paths, fs, filters=None, row_groups=None): +def _process_dataset( + paths, fs, filters=None, row_groups=None, categorical_partitions=True, +): + # Returns: + # file_list - Expanded/filtered list of paths + # row_groups - Filtered list of row-group selections + # partition_keys - list of partition keys for each file + # partition_categories - Categories for each partition # The general purpose of this function is to (1) expand # directory input into a list of paths (using the pyarrow - # dataset API), and (2) to apply row-group filters. 
+ # dataset API), (2) to apply row-group filters, and (3) + # to discover directory-partitioning information # Deal with case that the user passed in a directory name file_list = paths @@ -186,28 +195,107 @@ def _process_row_groups(paths, fs, filters=None, row_groups=None): if len(file_list) == 0: raise FileNotFoundError(f"{paths} could not be resolved to any files") - if filters is not None: - # Load IDs of filtered row groups for each file in dataset - filtered_rg_ids = defaultdict(list) - for fragment in dataset.get_fragments(filter=filters): - for rg_fragment in fragment.split_by_row_group(filters): - for rg_info in rg_fragment.row_groups: - filtered_rg_ids[rg_fragment.path].append(rg_info.id) - - # Initialize row_groups to be selected - if row_groups is None: - row_groups = [None for _ in dataset.files] - - # Store IDs of selected row groups for each file - for i, file in enumerate(dataset.files): - if row_groups[i] is None: - row_groups[i] = filtered_rg_ids[file] - else: - row_groups[i] = filter( - lambda id: id in row_groups[i], filtered_rg_ids[file] + # Deal with directory partitioning + # Get all partition keys (without filters) + partition_categories = defaultdict(list) + file_fragment = None + for file_fragment in dataset.get_fragments(): + keys = ds._get_partition_keys(file_fragment.partition_expression) + if not (keys or partition_categories): + # Bail - This is not a directory-partitioned dataset + break + for k, v in keys.items(): + if v not in partition_categories[k]: + partition_categories[k].append(v) + if not categorical_partitions: + # Bail - We don't need to discover all categories. + # We only need to save the partition keys from this + # first `file_fragment` + break + + if partition_categories and file_fragment is not None: + # Check/correct order of `categories` using last file_frag, + # because `_get_partition_keys` does NOT preserve the + # partition-hierarchy order of the keys. + cat_keys = [ + part.split("=")[0] + for part in file_fragment.path.split(fs.sep) + if "=" in part + ] + if set(partition_categories) == set(cat_keys): + partition_categories = { + k: partition_categories[k] + for k in cat_keys + if k in partition_categories + } + + # If we do not have partitioned data and + # are not filtering, we can return here + if filters is None and not partition_categories: + return file_list, row_groups, [], {} + + # Record initial row_groups input + row_groups_map = {} + if row_groups is not None: + # Make sure paths and row_groups map 1:1 + # and save the initial mapping + if len(paths) != len(file_list): + raise ValueError( + "Cannot specify a row_group selection for a directory path." 
+ ) + row_groups_map = {path: rgs for path, rgs in zip(paths, row_groups)} + + # Apply filters and discover partition columns + partition_keys = [] + if partition_categories or filters is not None: + file_list = [] + if filters is not None: + row_groups = [] + for file_fragment in dataset.get_fragments(filter=filters): + path = file_fragment.path + + # Extract hive-partition keys, and make sure they + # are orederd the same as they are in `partition_categories` + if partition_categories: + raw_keys = ds._get_partition_keys( + file_fragment.partition_expression + ) + partition_keys.append( + [ + (name, raw_keys[name]) + for name in partition_categories.keys() + ] ) - return file_list, row_groups + # Apply row-group filtering + selection = row_groups_map.get(path, None) + if selection is not None or filters is not None: + filtered_row_groups = [ + rg_info.id + for rg_fragment in file_fragment.split_by_row_group( + filters, schema=dataset.schema, + ) + for rg_info in rg_fragment.row_groups + ] + file_list.append(path) + if filters is not None: + if selection is None: + row_groups.append(filtered_row_groups) + else: + row_groups.append( + [ + rg_id + for rg_id in filtered_row_groups + if rg_id in selection + ] + ) + + return ( + file_list, + row_groups, + partition_keys, + partition_categories if categorical_partitions else {}, + ) def _get_byte_ranges(file_list, row_groups, columns, fs, **kwargs): @@ -319,6 +407,7 @@ def read_parquet( strings_to_categorical=False, use_pandas_metadata=True, use_python_file_object=False, + categorical_partitions=True, *args, **kwargs, ): @@ -345,17 +434,29 @@ def read_parquet( # Start by trying construct a filesystem object, so we # can apply filters on remote file-systems fs, paths = ioutils._get_filesystem_and_paths(filepath_or_buffer, **kwargs) - filepath_or_buffer = paths if paths else filepath_or_buffer - if fs is None and filters is not None: - raise ValueError("cudf cannot apply filters to open file objects.") - # Apply filters now (before converting non-local paths to buffers). - # Note that `_process_row_groups` will also expand `filepath_or_buffer` - # into a full list of files if it is a directory. - if fs is not None: - filepath_or_buffer, row_groups = _process_row_groups( - filepath_or_buffer, fs, filters=filters, row_groups=row_groups, + # Use pyarrow dataset to detect/process directory-partitioned + # data and apply filters. Note that we can only support partitioned + # data and filtering if the input is a single directory or list of + # paths. + partition_keys = [] + partition_categories = {} + if fs and paths: + ( + paths, + row_groups, + partition_keys, + partition_categories, + ) = _process_dataset( + paths, + fs, + filters=filters, + row_groups=row_groups, + categorical_partitions=categorical_partitions, ) + elif filters is not None: + raise ValueError("cudf cannot apply filters to open file objects.") + filepath_or_buffer = paths if paths else filepath_or_buffer # Check if we should calculate the specific byte-ranges # needed for each parquet file. 
We always do this when we @@ -380,15 +481,6 @@ def read_parquet( filepaths_or_buffers = [] for i, source in enumerate(filepath_or_buffer): - if ioutils.is_directory(source, **kwargs): - # Note: For now, we know `fs` is an fsspec filesystem - # object, but it may be an arrow object in the future - fsspec_fs = ioutils._ensure_filesystem( - passed_filesystem=fs, path=source - ) - source = ioutils.stringify_pathlike(source) - source = fsspec_fs.sep.join([source, "*.parquet"]) - tmp_source, compression = ioutils.get_filepath_or_buffer( path_or_data=source, compression=None, @@ -410,6 +502,117 @@ def read_parquet( else: filepaths_or_buffers.append(tmp_source) + # Warn user if they are not using cudf for IO + # (There is a good chance this was not the intention) + if engine != "cudf": + warnings.warn( + "Using CPU via PyArrow to read Parquet dataset." + "This option is both inefficient and unstable!" + ) + if filters is not None: + warnings.warn( + "Parquet row-group filtering is only supported with " + "'engine=cudf'. Use pandas or pyarrow API directly " + "for full CPU-based filtering functionality." + ) + + return _parquet_to_frame( + filepaths_or_buffers, + engine, + *args, + columns=columns, + row_groups=row_groups, + skiprows=skiprows, + num_rows=num_rows, + strings_to_categorical=strings_to_categorical, + use_pandas_metadata=use_pandas_metadata, + partition_keys=partition_keys, + partition_categories=partition_categories, + **kwargs, + ) + + +def _parquet_to_frame( + paths_or_buffers, + *args, + row_groups=None, + partition_keys=None, + partition_categories=None, + **kwargs, +): + + # If this is not a partitioned read, only need + # one call to `_read_parquet` + if not partition_keys: + return _read_parquet( + paths_or_buffers, *args, row_groups=row_groups, **kwargs, + ) + + # For partitioned data, we need a distinct read for each + # unique set of partition keys. Therefore, we start by + # aggregating all paths with matching keys using a dict + plan = {} + for i, (keys, path) in enumerate(zip(partition_keys, paths_or_buffers)): + rgs = row_groups[i] if row_groups else None + tkeys = tuple(keys) + if tkeys in plan: + plan[tkeys][0].append(path) + if rgs is not None: + plan[tkeys][1].append(rgs) + else: + plan[tkeys] = ([path], None if rgs is None else [rgs]) + + dfs = [] + for part_key, (key_paths, key_row_groups) in plan.items(): + # Add new DataFrame to our list + dfs.append( + _read_parquet( + key_paths, *args, row_groups=key_row_groups, **kwargs, + ) + ) + # Add partition columns to the last DataFrame + for (name, value) in part_key: + if partition_categories and name in partition_categories: + # Build the categorical column from `codes` + codes = as_column( + partition_categories[name].index(value), + length=len(dfs[-1]), + ) + dfs[-1][name] = build_categorical_column( + categories=partition_categories[name], + codes=codes, + size=codes.size, + offset=codes.offset, + ordered=False, + ) + else: + # Not building categorical columns, so + # `value` is already what we want + dfs[-1][name] = as_column(value, length=len(dfs[-1])) + + # Concatenate dfs and return. + # Assume we can ignore the index if it has no name. 
+ return ( + cudf.concat(dfs, ignore_index=dfs[-1].index.name is None) + if len(dfs) > 1 + else dfs[0] + ) + + +def _read_parquet( + filepaths_or_buffers, + engine, + columns=None, + row_groups=None, + skiprows=None, + num_rows=None, + strings_to_categorical=None, + use_pandas_metadata=None, + *args, + **kwargs, +): + # Simple helper function to dispatch between + # cudf and pyarrow to read parquet data if engine == "cudf": return libparquet.read_parquet( filepaths_or_buffers, @@ -421,7 +624,6 @@ def read_parquet( use_pandas_metadata=use_pandas_metadata, ) else: - warnings.warn("Using CPU via PyArrow to read Parquet dataset.") return cudf.DataFrame.from_arrow( pq.ParquetDataset(filepaths_or_buffers).read_pandas( columns=columns, *args, **kwargs diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index b6595be9566..516ee0d17d3 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1578,7 +1578,7 @@ def test_parquet_writer_bytes_io(simple_gdf): @pytest.mark.parametrize("filename", ["myfile.parquet", None]) @pytest.mark.parametrize("cols", [["b"], ["c", "b"]]) -def test_parquet_write_partitioned(tmpdir_factory, cols, filename): +def test_parquet_partitioned(tmpdir_factory, cols, filename): # Checks that write_to_dataset is wrapping to_parquet # as expected gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) @@ -1597,10 +1597,14 @@ def test_parquet_write_partitioned(tmpdir_factory, cols, filename): gdf_dir, index=False, partition_cols=cols, partition_file_name=filename ) - # Use pandas since dataset may be partitioned - expect = pd.read_parquet(pdf_dir) - got = pd.read_parquet(gdf_dir) - assert_eq(expect, got) + # Read back with pandas to compare + expect_pd = pd.read_parquet(pdf_dir) + got_pd = pd.read_parquet(gdf_dir) + assert_eq(expect_pd, got_pd) + + # Check that cudf and pd return the same read + got_cudf = cudf.read_parquet(gdf_dir) + assert_eq(got_pd, got_cudf) # If filename is specified, check that it is correct if filename: @@ -1629,9 +1633,9 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): gdf.to_parquet(dir1, partition_cols=cols) cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols) - # cudf read_parquet cannot handle partitioned dataset - expect = pd.read_parquet(dir1) - got = pd.read_parquet(dir2) + # Read back with cudf + expect = cudf.read_parquet(dir1) + got = cudf.read_parquet(dir2) assert_eq(expect, got) gdf = cudf.DataFrame( @@ -1645,6 +1649,80 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): gdf.to_parquet(dir1, partition_cols=cols) +@pytest.mark.parametrize( + "pfilters", [[("b", "==", "b")], [("b", "==", "a"), ("c", "==", 1)]], +) +@pytest.mark.parametrize("selection", ["directory", "files", "row-groups"]) +@pytest.mark.parametrize("use_cat", [True, False]) +def test_read_parquet_partitioned_filtered( + tmpdir, pfilters, selection, use_cat +): + path = str(tmpdir) + size = 100 + df = cudf.DataFrame( + { + "a": np.arange(0, stop=size, dtype="int64"), + "b": np.random.choice(list("abcd"), size=size), + "c": np.random.choice(np.arange(4), size=size), + } + ) + df.to_parquet(path, partition_cols=["c", "b"]) + + if selection == "files": + # Pass in a list of paths + fs = get_fs_token_paths(path)[0] + read_path = fs.find(path) + row_groups = None + elif selection == "row-groups": + # Pass in a list of paths AND row-group ids + fs = get_fs_token_paths(path)[0] + read_path = fs.find(path) + row_groups = [[0] for p in read_path] + else: + # Pass in a directory path + # 
(row-group selection not allowed in this case) + read_path = path + row_groups = None + + # Filter on partitioned columns + expect = pd.read_parquet(read_path, filters=pfilters) + got = cudf.read_parquet( + read_path, + filters=pfilters, + row_groups=row_groups, + categorical_partitions=use_cat, + ) + if use_cat: + assert got.dtypes["b"] == "category" + assert got.dtypes["c"] == "category" + else: + # Check that we didn't get categorical + # columns, but convert back to categorical + # for comparison with pandas + assert got.dtypes["b"] == "object" + assert got.dtypes["c"] == "int" + got["b"] = pd.Categorical( + got["b"].to_pandas(), categories=list("abcd") + ) + got["c"] = pd.Categorical( + got["c"].to_pandas(), categories=np.arange(4) + ) + assert_eq(expect, got) + + # Filter on non-partitioned column. + # Cannot compare to pandas, since the pyarrow + # backend will filter by row (and cudf can + # only filter by column, for now) + filters = [("a", "==", 10)] + got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,) + assert len(got) < len(df) and 10 in got["a"] + + # Filter on both kinds of columns + filters = [[("a", "==", 10)], [("c", "==", 1)]] + got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,) + assert len(got) < len(df) and (1 in got["c"] and 10 in got["a"]) + + def test_parquet_writer_chunked_metadata(tmpdir, simple_pdf, simple_gdf): gdf_fname = tmpdir.join("gdf.parquet") test_path = "test/path" diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index dea876891f8..5738e1f0d00 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -346,12 +346,17 @@ def test_read_parquet_filters(s3_base, s3so, pdf, python_file): assert_eq(pdf.iloc[:0], got.reset_index(drop=True)) -def test_write_parquet(s3_base, s3so, pdf): +@pytest.mark.parametrize("partition_cols", [None, ["String"]]) +def test_write_parquet(s3_base, s3so, pdf, partition_cols): fname = "test_parquet_writer.parquet" bname = "parquet" gdf = cudf.from_pandas(pdf) with s3_context(s3_base=s3_base, bucket=bname) as s3fs: - gdf.to_parquet("s3://{}/{}".format(bname, fname), storage_options=s3so) + gdf.to_parquet( + "s3://{}/{}".format(bname, fname), + partition_cols=partition_cols, + storage_options=s3so, + ) assert s3fs.exists("s3://{}/{}".format(bname, fname)) got = pd.read_parquet(s3fs.open("s3://{}/{}".format(bname, fname))) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0f9d9d53b23..e6c031acac7 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -154,6 +154,9 @@ strings_to_categorical : boolean, default False If True, return string columns as GDF_CATEGORY dtype; if False, return a as GDF_STRING dtype. +categorical_partitions : boolean, default True + Whether directory-partitioned columns should be interpreted as categorical + or raw dtypes. use_pandas_metadata : boolean, default True If True and dataset has custom PANDAS schema metadata, ensure that index columns are also loaded. 
@@ -1129,7 +1132,7 @@ def ensure_single_filepath_or_buffer(path_or_data, **kwargs): storage_options = kwargs.get("storage_options") path_or_data = os.path.expanduser(path_or_data) try: - fs, _, paths = fsspec.get_fs_token_paths( + fs, _, paths = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options ) except ValueError as e: @@ -1153,9 +1156,9 @@ def is_directory(path_or_data, **kwargs): storage_options = kwargs.get("storage_options") path_or_data = os.path.expanduser(path_or_data) try: - fs, _, paths = fsspec.get_fs_token_paths( + fs = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options - ) + )[0] except ValueError as e: if str(e).startswith("Protocol not known"): return False @@ -1189,10 +1192,8 @@ def _get_filesystem_and_paths(path_or_data, **kwargs): else: path_or_data = [path_or_data] - # Pyarrow did not support the protocol or storage options. - # Fall back to fsspec try: - fs, _, fs_paths = fsspec.get_fs_token_paths( + fs, _, fs_paths = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options ) return_paths = fs_paths @@ -1322,9 +1323,9 @@ def get_writer_filepath_or_buffer(path_or_data, mode, **kwargs): if isinstance(path_or_data, str): storage_options = kwargs.get("storage_options", {}) path_or_data = os.path.expanduser(path_or_data) - fs, _, _ = fsspec.get_fs_token_paths( + fs = get_fs_token_paths( path_or_data, mode=mode or "w", storage_options=storage_options - ) + )[0] if not _is_local_filesystem(fs): filepath_or_buffer = fsspec.open( @@ -1513,11 +1514,12 @@ def _prepare_filters(filters): return filters -def _ensure_filesystem(passed_filesystem, path): +def _ensure_filesystem(passed_filesystem, path, **kwargs): if passed_filesystem is None: - return get_fs_token_paths(path[0] if isinstance(path, list) else path)[ - 0 - ] + return get_fs_token_paths( + path[0] if isinstance(path, list) else path, + storage_options=kwargs.get("storage_options", {}), + )[0] return passed_filesystem diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index b47a5e78095..a49d73493ec 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -126,11 +126,8 @@ def _read_paths( # Build the column from `codes` directly # (since the category is often a larger dtype) - codes = ( - as_column(partitions[i].keys.index(index2)) - .as_frame() - .repeat(len(df)) - ._data[None] + codes = as_column( + partitions[i].keys.index(index2), length=len(df), ) df[name] = build_categorical_column( categories=partitions[i].keys, From e82cc62e2ea61211c64ba4784cb131d5b535644c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 3 Dec 2021 04:46:25 -0800 Subject: [PATCH 049/202] Fix join of MultiIndex to Index with one column and overlapping name. 
(#9830) This PR resolves #9823 Authors: - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/9830 --- python/cudf/cudf/core/_base_index.py | 4 ++-- python/cudf/cudf/tests/test_joining.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d688b75ed14..2fcc976d8e1 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1147,14 +1147,14 @@ def join( if isinstance(lhs, cudf.MultiIndex): if level is not None and isinstance(level, int): on = lhs._data.select_by_index(level).names[0] - right_names = (on,) or right_names + right_names = (on,) if on is not None else right_names on = right_names[0] if how == "outer": how = "left" elif how == "right": how = "inner" else: - # Both are nomal indices + # Both are normal indices right_names = left_names on = right_names[0] diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 0518cc2c9b9..d25c6130bfb 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -2150,3 +2150,16 @@ def test_join_redundant_params(): lhs.merge(rhs, right_on="a", left_index=True, right_index=True) with pytest.raises(ValueError): lhs.merge(rhs, left_on="c", right_on="b") + + +def test_join_multiindex_index(): + # test joining a MultiIndex with an Index with overlapping name + lhs = ( + cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]}) + .set_index(["a", "b"]) + .index + ) + rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index("a").index + expect = lhs.to_pandas().join(rhs.to_pandas(), how="inner") + got = lhs.join(rhs, how="inner") + assert_join_results_equal(expect, got, how="inner") From 62103c6a99b4f2df00965e733542e08ce4b11448 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Fri, 3 Dec 2021 08:34:47 -0800 Subject: [PATCH 050/202] Added a few more tests for Decimal to String cast (#9818) This PR adds a few more edge cases as a sanity test on the request of @sameerz Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9818 --- .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 31a52eb2ec0..7120a40a26a 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3376,6 +3376,8 @@ void testFixedWidthCast() { void testCastBigDecimalToString() { BigDecimal[] bigValues = {new BigDecimal("923121331938210123.321"), new BigDecimal("9223372036854775808.191"), + new BigDecimal("-9.223"), + new BigDecimal("0.000"), new BigDecimal("9328323982309091029831.002") }; @@ -3383,9 +3385,21 @@ void testCastBigDecimalToString() { ColumnVector values = cv.castTo(DType.STRING); ColumnVector expected = ColumnVector.fromStrings("923121331938210123.321", "9223372036854775808.191", - "9328323982309091029831.002")) { + "-9.223", + "0.000", + "9328323982309091029831.002")) { assertColumnsAreEqual(expected, values); } + + BigDecimal[] bigValues0 = {new BigDecimal("992983283728193827182918744829283742232")}; + try { + ColumnVector cv = ColumnVector.fromDecimals(bigValues0); 
+ if (cv != null) { + cv.close(); + } + fail("Precision check should've thrown an IllegalArgumentException"); + } catch (IllegalArgumentException iae) { + } } @Test From fdd9bb00dc0ba5ac373feaa079b782029130dae3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 3 Dec 2021 16:13:28 -0700 Subject: [PATCH 051/202] Add JNI for `cudf::drop_duplicates` (#9841) This adds Java binding for `cudf::drop_duplicates`. Note that when choosing which duplicate element to keep, only `KEEP_FIRST` or `KEEP_LAST` option can be selected. In other words, this does not support `KEEP_NONE` to remove all duplicate elements. Closes #9115. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9841 --- java/src/main/java/ai/rapids/cudf/Table.java | 92 ++++++++++++------- java/src/main/native/src/TableJni.cpp | 26 ++++++ .../test/java/ai/rapids/cudf/TableTest.java | 26 ++++++ 3 files changed, 112 insertions(+), 32 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b11808ed023..e32d466e853 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -645,6 +645,10 @@ private static native long[] conditionalLeftAntiJoinGatherMapWithCount(long left private static native long[] filter(long input, long mask); + private static native long[] dropDuplicates(long nativeHandle, int[] keyColumns, + boolean keepFirst, boolean nullsEqual, + boolean nullsBefore) throws CudfException; + private static native long[] gather(long tableHandle, long gatherView, boolean checkBounds); private static native long[] convertToRows(long nativeHandle); @@ -1820,6 +1824,30 @@ public Table filter(ColumnView mask) { return new Table(filter(nativeHandle, mask.getNativeView())); } + /** + * Copy rows of the current table to an output table such that duplicate rows in the key columns + * are ignored (i.e., only one row from the duplicate ones will be copied). These keys columns are + * a subset of the current table columns and their indices are specified by an input array. + * + * Currently, the output table is sorted by key columns, using stable sort. However, this is not + * guaranteed in the future. + * + * @param keyColumns Array of indices representing key columns from the current table. + * @param keepFirst If it is true, the first row with a duplicated key will be copied. Otherwise, + * copy the last row with a duplicated key. + * @param nullsEqual Flag to denote whether nulls are treated as equal when comparing rows of the + * key columns to check for uniqueness. + * @param nullsBefore Flag to specify whether nulls in the key columns will appear before or + * after non-null elements when sorting the table. + * + * @return Table with unique keys. + */ + public Table dropDuplicates(int[] keyColumns, boolean keepFirst, boolean nullsEqual, + boolean nullsBefore) { + assert keyColumns.length >= 1 : "Input keyColumns must contain indices of at least one column"; + return new Table(dropDuplicates(nativeHandle, keyColumns, keepFirst, nullsEqual, nullsBefore)); + } + /** * Split a table at given boundaries, but the result of each split has memory that is laid out * in a contiguous range of memory. This allows for us to optimize copying the data in a single @@ -3005,27 +3033,27 @@ public Table aggregate(GroupByAggregationOnColumn... 
aggregates) { } /** - * Computes row-based window aggregation functions on the Table/projection, + * Computes row-based window aggregation functions on the Table/projection, * based on windows specified in the argument. - * + * * This method enables queries such as the following SQL: - * - * SELECT user_id, - * MAX(sales_amt) OVER(PARTITION BY user_id ORDER BY date + * + * SELECT user_id, + * MAX(sales_amt) OVER(PARTITION BY user_id ORDER BY date * ROWS BETWEEN 1 PRECEDING and 1 FOLLOWING) * FROM my_sales_table WHERE ... - * + * * Each window-aggregation is represented by a different {@link AggregationOverWindow} argument, * indicating: * 1. the {@link Aggregation.Kind}, * 2. the number of rows preceding and following the current row, within a window, * 3. the minimum number of observations within the defined window - * + * * This method returns a {@link Table} instance, with one result column for each specified * window aggregation. - * + * * In this example, for the following input: - * + * * [ // user_id, sales_amt * { "user1", 10 }, * { "user2", 20 }, @@ -3037,19 +3065,19 @@ public Table aggregate(GroupByAggregationOnColumn... aggregates) { * { "user1", 60 }, * { "user2", 40 } * ] - * - * Partitioning (grouping) by `user_id` yields the following `sales_amt` vector + * + * Partitioning (grouping) by `user_id` yields the following `sales_amt` vector * (with 2 groups, one for each distinct `user_id`): - * + * * [ 10, 20, 10, 50, 60, 20, 30, 80, 40 ] * <-------user1-------->|<------user2-------> - * + * * The SUM aggregation is applied with 1 preceding and 1 following * row, with a minimum of 1 period. The aggregation window is thus 3 rows wide, * yielding the following column: - * + * * [ 30, 40, 80, 120, 110, 50, 130, 150, 120 ] - * + * * @param windowAggregates the window-aggregations to be performed * @return Table instance, with each column containing the result of each aggregation. * @throws IllegalArgumentException if the window arguments are not of type @@ -3068,7 +3096,7 @@ public Table aggregateWindows(AggregationOverWindow... windowAggregates) { for (int outputIndex = 0; outputIndex < windowAggregates.length; outputIndex++) { AggregationOverWindow agg = windowAggregates[outputIndex]; if (agg.getWindowOptions().getFrameType() != WindowOptions.FrameType.ROWS) { - throw new IllegalArgumentException("Expected ROWS-based window specification. Unexpected window type: " + throw new IllegalArgumentException("Expected ROWS-based window specification. Unexpected window type: " + agg.getWindowOptions().getFrameType()); } ColumnWindowOps ops = groupedOps.computeIfAbsent(agg.getColumnIndex(), (idx) -> new ColumnWindowOps()); @@ -3129,27 +3157,27 @@ public Table aggregateWindows(AggregationOverWindow... windowAggregates) { /** * Computes range-based window aggregation functions on the Table/projection, * based on windows specified in the argument. - * + * * This method enables queries such as the following SQL: - * - * SELECT user_id, - * MAX(sales_amt) OVER(PARTITION BY user_id ORDER BY date + * + * SELECT user_id, + * MAX(sales_amt) OVER(PARTITION BY user_id ORDER BY date * RANGE BETWEEN INTERVAL 1 DAY PRECEDING and CURRENT ROW) * FROM my_sales_table WHERE ... - * + * * Each window-aggregation is represented by a different {@link AggregationOverWindow} argument, * indicating: * 1. the {@link Aggregation.Kind}, * 2. the index for the timestamp column to base the window definitions on * 2. the number of DAYS preceding and following the current row's date, to consider in the window * 3. 
the minimum number of observations within the defined window - * + * * This method returns a {@link Table} instance, with one result column for each specified * window aggregation. - * + * * In this example, for the following input: - * - * [ // user, sales_amt, YYYYMMDD (date) + * + * [ // user, sales_amt, YYYYMMDD (date) * { "user1", 10, 20200101 }, * { "user2", 20, 20200101 }, * { "user1", 20, 20200102 }, @@ -3160,19 +3188,19 @@ public Table aggregateWindows(AggregationOverWindow... windowAggregates) { * { "user1", 60, 20200107 }, * { "user2", 40, 20200104 } * ] - * - * Partitioning (grouping) by `user_id`, and ordering by `date` yields the following `sales_amt` vector + * + * Partitioning (grouping) by `user_id`, and ordering by `date` yields the following `sales_amt` vector * (with 2 groups, one for each distinct `user_id`): - * + * * Date :(202001-) [ 01, 02, 03, 07, 07, 01, 01, 02, 04 ] * Input: [ 10, 20, 10, 50, 60, 20, 30, 80, 40 ] * <-------user1-------->|<---------user2---------> - * - * The SUM aggregation is applied, with 1 day preceding, and 1 day following, with a minimum of 1 period. + * + * The SUM aggregation is applied, with 1 day preceding, and 1 day following, with a minimum of 1 period. * The aggregation window is thus 3 *days* wide, yielding the following output column: - * + * * Results: [ 30, 40, 30, 110, 110, 130, 130, 130, 40 ] - * + * * @param windowAggregates the window-aggregations to be performed * @return Table instance, with each column containing the result of each aggregation. * @throws IllegalArgumentException if the window arguments are not of type diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index f3377bb002d..18e7936f322 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2676,6 +2676,32 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_filter(JNIEnv *env, jclas CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( + JNIEnv *env, jclass, jlong input_jtable, jintArray key_columns, jboolean keep_first, + jboolean nulls_equal, jboolean nulls_before) { + JNI_NULL_CHECK(env, input_jtable, "input table is null", 0); + JNI_NULL_CHECK(env, key_columns, "input key_columns is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_jtable); + + static_assert(sizeof(jint) == sizeof(cudf::size_type), "Integer types mismatched."); + auto const native_keys_indices = cudf::jni::native_jintArray(env, key_columns); + auto const keys_indices = + std::vector(native_keys_indices.begin(), native_keys_indices.end()); + + auto result = cudf::drop_duplicates( + *input, keys_indices, + keep_first ? cudf::duplicate_keep_option::KEEP_FIRST : + cudf::duplicate_keep_option::KEEP_LAST, + nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL, + nulls_before ? 
cudf::null_order::BEFORE : cudf::null_order::AFTER,
+        rmm::mr::get_current_device_resource());
+    return cudf::jni::convert_table_for_return(env, result);
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclass, jlong j_input,
                                                               jlong j_map, jboolean check_bounds) {
   JNI_NULL_CHECK(env, j_input, "input table is null", 0);
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 0b2f56895e9..a5779bf9dbb 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -6592,6 +6592,32 @@ void testTableBasedFilter() {
     }
   }
 
+  @Test
+  void testDropDuplicates() {
+    int[] keyColumns = new int[]{ 1 };
+
+    try (ColumnVector col1 = ColumnVector.fromBoxedInts(5, null, 3, 5, 8, 1);
+         ColumnVector col2 = ColumnVector.fromBoxedInts(20, null, null, 19, 21, 19);
+         Table input = new Table(col1, col2)) {
+
+      // Keep the first duplicate element.
+      try (Table result = input.dropDuplicates(keyColumns, true, true, true);
+           ColumnVector expectedCol1 = ColumnVector.fromBoxedInts(null, 5, 5, 8);
+           ColumnVector expectedCol2 = ColumnVector.fromBoxedInts(null, 19, 20, 21);
+           Table expected = new Table(expectedCol1, expectedCol2)) {
+        assertTablesAreEqual(expected, result);
+      }
+
+      // Keep the last duplicate element.
+      try (Table result = input.dropDuplicates(keyColumns, false, true, true);
+           ColumnVector expectedCol1 = ColumnVector.fromBoxedInts(3, 1, 5, 8);
+           ColumnVector expectedCol2 = ColumnVector.fromBoxedInts(null, 19, 20, 21);
+           Table expected = new Table(expectedCol1, expectedCol2)) {
+        assertTablesAreEqual(expected, result);
+      }
+    }
+  }
+
   private enum Columns {
     BOOL("BOOL"),
     INT("INT"),

From 8002cbd87367135a941d1145c9d489a8f82dc76d Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 6 Dec 2021 08:38:41 -0500
Subject: [PATCH 052/202] Allow runtime has_nulls parameter for row operators
 (#9623)

Closes #6952

This PR allows the `has_nulls` template parameter for row operators to be
used as a runtime parameter in places where the null-handling logic has
little to no effect on runtime performance. This can improve compile time as
described in #6952.
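As a rough sketch of the pattern this enables (adapted from the doc comments updated below; the function name and element type are placeholders, not part of the change):

```cpp
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/iterator.cuh>

void some_function(cudf::column_view const& col_view)
{
  auto d_col = cudf::column_device_view::create(col_view);
  // A single instantiation now covers both the nulls and no-nulls cases;
  // nullability is resolved at runtime rather than at compile time.
  auto optional_iterator = cudf::detail::make_optional_iterator<int32_t>(
    *d_col, cudf::nullate::DYNAMIC{col_view.has_nulls()});
  // Each element is a thrust::optional<int32_t> that holds a value only
  // when the corresponding column element is valid.
}
```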
This will also close #9152 and #9580 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Nghia Truong (https://github.com/ttnghia) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9623 --- cpp/benchmarks/groupby/group_sum_benchmark.cu | 7 +- cpp/benchmarks/hashing/hash_benchmark.cpp | 37 +- .../cudf/column/column_device_view.cuh | 262 ++++---------- cpp/include/cudf/detail/iterator.cuh | 330 +++++------------- cpp/include/cudf/detail/merge.cuh | 4 +- cpp/include/cudf/table/row_operators.cuh | 173 +++++---- cpp/src/copying/copy.cu | 12 +- cpp/src/copying/segmented_shift.cu | 19 +- cpp/src/groupby/hash/groupby.cu | 49 ++- cpp/src/groupby/sort/group_nunique.cu | 4 +- cpp/src/groupby/sort/group_rank_scan.cu | 34 +- cpp/src/groupby/sort/group_scan_util.cuh | 21 +- .../sort/group_single_pass_reduction_util.cuh | 21 +- cpp/src/groupby/sort/sort_helper.cu | 34 +- cpp/src/hash/hashing.cu | 55 +-- cpp/src/hash/murmur_hash.cu | 34 +- cpp/src/join/hash_join.cu | 10 +- cpp/src/join/hash_join.cuh | 4 +- cpp/src/join/join_common_utils.cuh | 6 +- cpp/src/join/join_common_utils.hpp | 4 +- cpp/src/join/semi_join.cu | 8 +- cpp/src/partitioning/partitioning.cu | 3 +- cpp/src/reductions/arg_minmax_util.cuh | 19 +- cpp/src/reductions/scan/rank_scan.cu | 30 +- cpp/src/reductions/scan/scan_inclusive.cu | 15 +- cpp/src/reductions/simple.cuh | 16 +- cpp/src/replace/clamp.cu | 10 +- cpp/src/replace/nans.cu | 4 +- cpp/src/search/search.cu | 18 +- cpp/src/sort/is_sorted.cu | 18 +- cpp/src/sort/rank.cu | 28 +- cpp/src/sort/sort_impl.cuh | 49 +-- cpp/src/stream_compaction/distinct_count.cu | 30 +- cpp/src/stream_compaction/drop_duplicates.cu | 41 +-- cpp/src/transform/one_hot_encode.cu | 20 +- cpp/tests/iterator/optional_iterator_test.cuh | 22 +- .../optional_iterator_test_numeric.cu | 2 +- cpp/tests/table/table_view_tests.cu | 4 +- cpp/tests/utilities/column_utilities.cu | 16 +- 39 files changed, 542 insertions(+), 931 deletions(-) diff --git a/cpp/benchmarks/groupby/group_sum_benchmark.cu b/cpp/benchmarks/groupby/group_sum_benchmark.cu index f64022690d9..0e9f5061a1a 100644 --- a/cpp/benchmarks/groupby/group_sum_benchmark.cu +++ b/cpp/benchmarks/groupby/group_sum_benchmark.cu @@ -44,7 +44,6 @@ void BM_basic_sum(benchmark::State& state) { using wrapper = cudf::test::fixed_width_column_wrapper; - // const cudf::size_type num_columns{(cudf::size_type)state.range(0)}; const cudf::size_type column_size{(cudf::size_type)state.range(0)}; auto data_it = cudf::detail::make_counting_transform_iterator( @@ -53,7 +52,7 @@ void BM_basic_sum(benchmark::State& state) wrapper keys(data_it, data_it + column_size); wrapper vals(data_it, data_it + column_size); - cudf::groupby::groupby gb_obj(cudf::table_view({keys})); + cudf::groupby::groupby gb_obj(cudf::table_view({keys, keys, keys})); std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); @@ -73,7 +72,9 @@ BENCHMARK_REGISTER_F(Groupby, Basic) ->UseManualTime() ->Unit(benchmark::kMillisecond) ->Arg(10000) - ->Arg(10000000); + ->Arg(1000000) + ->Arg(10000000) + ->Arg(100000000); void BM_pre_sorted_sum(benchmark::State& state) { diff --git a/cpp/benchmarks/hashing/hash_benchmark.cpp b/cpp/benchmarks/hashing/hash_benchmark.cpp index 77b10399693..4ccb0bfad9d 100644 --- a/cpp/benchmarks/hashing/hash_benchmark.cpp +++ b/cpp/benchmarks/hashing/hash_benchmark.cpp @@ -25,10 +25,14 @@ class HashBenchmark : public cudf::benchmark { }; -static void 
BM_hash(benchmark::State& state, cudf::hash_id hid) +enum contains_nulls { no_nulls, nulls }; + +static void BM_hash(benchmark::State& state, cudf::hash_id hid, contains_nulls has_nulls) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const data = create_random_table({cudf::type_id::INT64}, 1, row_count{n_rows}); + if (has_nulls == contains_nulls::no_nulls) + data->get_column(0).set_null_mask(rmm::device_buffer{}, 0); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); @@ -36,16 +40,25 @@ static void BM_hash(benchmark::State& state, cudf::hash_id hid) } } -#define HASH_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(HashBenchmark, name) \ - (::benchmark::State & st) { BM_hash(st, cudf::hash_id::name); } \ - BENCHMARK_REGISTER_F(HashBenchmark, name) \ - ->RangeMultiplier(4) \ - ->Ranges({{1 << 14, 1 << 24}}) \ - ->UseManualTime() \ +#define concat(a, b, c) a##b##c + +#define H_BENCHMARK_DEFINE(name, hid, n) \ + BENCHMARK_DEFINE_F(HashBenchmark, name) \ + (::benchmark::State & st) { BM_hash(st, cudf::hash_id::hid, contains_nulls::n); } \ + BENCHMARK_REGISTER_F(HashBenchmark, name) \ + ->RangeMultiplier(4) \ + ->Ranges({{1 << 14, 1 << 24}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -HASH_BENCHMARK_DEFINE(HASH_MURMUR3) -HASH_BENCHMARK_DEFINE(HASH_MD5) -HASH_BENCHMARK_DEFINE(HASH_SERIAL_MURMUR3) -HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3) +#define HASH_BENCHMARK_DEFINE(hid, n) H_BENCHMARK_DEFINE(concat(hid, _, n), hid, n) + +HASH_BENCHMARK_DEFINE(HASH_MURMUR3, nulls) +HASH_BENCHMARK_DEFINE(HASH_MD5, nulls) +HASH_BENCHMARK_DEFINE(HASH_SERIAL_MURMUR3, nulls) +HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, nulls) + +HASH_BENCHMARK_DEFINE(HASH_MURMUR3, no_nulls) +HASH_BENCHMARK_DEFINE(HASH_MD5, no_nulls) +HASH_BENCHMARK_DEFINE(HASH_SERIAL_MURMUR3, no_nulls) +HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, no_nulls) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 6ecb0796283..a15f20ef52d 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -44,23 +44,30 @@ namespace cudf { /** - * @brief Policy for what assumptions the optional iterator has about null values + * @brief Indicates the presence of nulls at compile-time or runtime. * - * - `YES` means that the column supports nulls and has null values, therefore - * the optional might not contain a value + * If used at compile-time, this indicator can tell the optimizer + * to include or exclude any null-checking clauses. * - * - `NO` means that the column has no null values, therefore the optional will - * always have a value - * - * - `DYNAMIC` defers the assumption of nullability to runtime with the users stating - * on construction of the iterator if column has nulls. */ -struct contains_nulls { - struct YES { +struct nullate { + struct YES : std::bool_constant { }; - struct NO { + struct NO : std::bool_constant { }; struct DYNAMIC { + DYNAMIC() = delete; + /** + * @brief Create a runtime nullate object. + * + * @see cudf::column_device_view::optional_begin for example usage + * + * @param b True if nulls are expected in the operation in which this + * object is applied. 
+ */ + constexpr explicit DYNAMIC(bool b) noexcept : value{b} {} + constexpr operator bool() const noexcept { return value; } + bool value; ///< True if nulls are expected }; }; @@ -282,7 +289,7 @@ class alignas(16) column_device_view_base { // Forward declaration template struct value_accessor; -template +template struct optional_accessor; template struct pair_accessor; @@ -493,11 +500,11 @@ class alignas(16) column_device_view : public detail::column_device_view_base { } /** - * @brief optional iterator for navigating this column + * @brief Optional iterator for navigating this column */ - template + template using const_optional_iterator = - thrust::transform_iterator, count_it>; + thrust::transform_iterator, count_it>; /** * @brief Pair iterator for navigating this column @@ -520,117 +527,57 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * * Dereferencing the returned iterator returns a `thrust::optional`. * - * When the element of an iterator contextually converted to bool, the conversion returns true + * The element of this iterator contextually converts to bool. The conversion returns true * if the object contains a value and false if it does not contain a value. * - * optional_begin with mode `DYNAMIC` defers the assumption of nullability to - * runtime, with the user stating on construction of the iterator if column has nulls. - * `DYNAMIC` mode is nice when an algorithm is going to execute on multiple - * iterators and you don't want to compile all the combinations of iterator types - * - * Example: + * Calling this method with `nullate::DYNAMIC` defers the assumption of nullability to + * runtime with the caller indicating if the column has nulls. The `nullate::DYNAMIC` is + * useful when an algorithm is going to execute on multiple iterators and all the combinations of + * iterator types are not required at compile time. * - * \code{.cpp} + * @code{.cpp} * template * void some_function(cudf::column_view const& col_view){ * auto d_col = cudf::column_device_view::create(col_view); * // Create a `DYNAMIC` optional iterator - * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::DYNAMIC{}, - * col_view.has_nulls()); - * } - * \endcode - * - * This function does not participate in overload resolution if - * `column_device_view::has_element_accessor()` is false. - * - * @throws cudf::logic_error if the column is not nullable, and `DYNAMIC` mode used and - * the user has stated nulls exist - * @throws cudf::logic_error if column datatype and Element type mismatch. - */ - template ())> - auto optional_begin(contains_nulls::DYNAMIC, bool has_nulls) const - { - return const_optional_iterator{ - count_it{0}, detail::optional_accessor{*this, has_nulls}}; - } - - /** - * @brief Return an optional iterator to the first element of the column. - * - * Dereferencing the returned iterator returns a `thrust::optional`. - * - * When the element of an iterator contextually converted to bool, the conversion returns true - * if the object contains a value and false if it does not contain a value. 
- * - * optional_begin with mode `YES` means that the column supports nulls and - * potentially has null values, therefore the optional might not contain a value - * - * Example: - * - * \code{.cpp} - * template - * void some_function(cudf::column_view const& col_view){ - * auto d_col = cudf::column_device_view::create(col_view); - * if constexpr(has_nulls) { - * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::YES{}); - * //use optional_iterator - * } else { - * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::NO{}); - * //use optional_iterator - * } + * auto optional_iterator = + * d_col->optional_begin(cudf::nullate::DYNAMIC{col_view.has_nulls()}); * } - * \endcode + * @endcode * - * This function does not participate in overload resolution if - * `column_device_view::has_element_accessor()` is false. + * Calling this method with `nullate::YES` means that the column supports nulls and + * the optional returned might not contain a value. * - * @throws cudf::logic_error if the column is not nullable, and `YES` mode used - * @throws cudf::logic_error if column datatype and Element type mismatch. - */ - template ())> - auto optional_begin(contains_nulls::YES) const - { - return const_optional_iterator{ - count_it{0}, detail::optional_accessor{*this}}; - } - - /** - * @brief Return an optional iterator to the first element of the column. + * Calling this method with `nullate::NO` means that the column has no null values + * and the optional returned will always contain a value. * - * Dereferencing the returned iterator returns a `thrust::optional`. - * - * When the element of an iterator contextually converted to bool, the conversion returns true - * if the object contains a value and false if it does not contain a value. - * - * optional_begin with mode `NO` means that the column has no null values, - * therefore the optional will always contain a value. - * - * Example: - * - * \code{.cpp} + * @code{.cpp} * template * void some_function(cudf::column_view const& col_view){ * auto d_col = cudf::column_device_view::create(col_view); * if constexpr(has_nulls) { - * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::YES{}); + * auto optional_iterator = d_col->optional_begin(cudf::nullate::YES{}); * //use optional_iterator * } else { - * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::NO{}); + * auto optional_iterator = d_col->optional_begin(cudf::nullate::NO{}); * //use optional_iterator * } * } - * \endcode + * @endcode * * This function does not participate in overload resolution if * `column_device_view::has_element_accessor()` is false. * + * @throws cudf::logic_error if the column is not nullable and `has_nulls` evaluates to true. * @throws cudf::logic_error if column datatype and Element type mismatch. */ - template ())> - auto optional_begin(contains_nulls::NO) const + template ())> + auto optional_begin(Nullate has_nulls) const { - return const_optional_iterator{ - count_it{0}, detail::optional_accessor{*this}}; + return const_optional_iterator{ + count_it{0}, detail::optional_accessor{*this, has_nulls}}; } /** @@ -695,57 +642,21 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @brief Return an optional iterator to the element following the last element of * the column. * - * Dereferencing the returned iterator returns a `thrust::optional`. + * The returned iterator represents a `thrust::optional` element. 
* * This function does not participate in overload resolution if * `column_device_view::has_element_accessor()` is false. * - * @throws cudf::logic_error if the column is not nullable, and `DYNAMIC` mode used and - * the user has stated nulls exist + * @throws cudf::logic_error if the column is not nullable and `has_nulls` is true * @throws cudf::logic_error if column datatype and Element type mismatch. */ - template ())> - auto optional_end(contains_nulls::DYNAMIC, bool has_nulls) const - { - return const_optional_iterator{ - count_it{size()}, detail::optional_accessor{*this, has_nulls}}; - } - - /** - * @brief Return an optional iterator to the element following the last element of - * the column. - * - * Dereferencing the returned iterator returns a `thrust::optional`. - * - * This function does not participate in overload resolution if - * `column_device_view::has_element_accessor()` is false. - * - * @throws cudf::logic_error if the column is not nullable, and `YES` mode used - * @throws cudf::logic_error if column datatype and Element type mismatch. - */ - template ())> - auto optional_end(contains_nulls::YES) const - { - return const_optional_iterator{ - count_it{size()}, detail::optional_accessor{*this}}; - } - - /** - * @brief Return an optional iterator to the element following the last element of - * the column. - * - * Dereferencing the returned iterator returns a `thrust::optional`. - * - * This function does not participate in overload resolution if - * `column_device_view::has_element_accessor()` is false. - * - * @throws cudf::logic_error if column datatype and Element type mismatch. - */ - template ())> - auto optional_end(contains_nulls::NO) const + template ())> + auto optional_end(Nullate has_nulls) const { - return const_optional_iterator{ - count_it{size()}, detail::optional_accessor{*this}}; + return const_optional_iterator{ + count_it{size()}, detail::optional_accessor{*this, has_nulls}}; } /** @@ -1201,77 +1112,56 @@ struct value_accessor { * @brief optional accessor of a column * * - * The optional_accessor always returns a thrust::optional of column[i]. The validity - * of the optional is determined by the contains_nulls_mode template parameter - * which has the following modes: + * The optional_accessor always returns a `thrust::optional` of `column[i]`. The validity + * of the optional is determined by the `Nullate` parameter which may be one of the following: * - * - `YES` means that the column supports nulls and has null values, therefore - * the optional might be valid or invalid + * - `nullate::YES` means that the column supports nulls and the optional returned + * might be valid or invalid. * - * - `NO` the user has attested that the column has no null values, + * - `nullate::NO` means the caller attests that the column has no null values, * no checks will occur and `thrust::optional{column[i]}` will be * return for each `i`. * - * - `DYNAMIC` defers the assumption of nullability to runtime with the users stating - * on construction of the iterator if column has nulls. - * When `with_nulls=true` the return value validity will be determined if column[i] - * is not null. - * When `with_nulls=false` the return value will always be valid + * - `nullate::DYNAMIC` defers the assumption of nullability to runtime and the caller + * specifies if the column has nulls at runtime. + * For `DYNAMIC{true}` the return value will be `thrust::optional{column[i]}` if + * element `i` is not null and `thrust::optional{}` if element `i` is null. 
+ * For `DYNAMIC{false}` the return value will always be `thrust::optional{column[i]}`. * * @throws cudf::logic_error if column datatype and template T type mismatch. - * @throws cudf::logic_error if the column is not nullable, and `with_nulls=true` - * + * @throws cudf::logic_error if the column is not nullable and `with_nulls` evaluates to true * * @tparam T The type of elements in the column - * @tparam contains_nulls_mode Specifies if nulls are checked at runtime or compile time. + * @tparam Nullate A cudf::nullate type describing how to check for nulls. */ -template +template struct optional_accessor { column_device_view const col; ///< column view of column in device /** - * @brief constructor - * @param[in] _col column device view of cudf column + * @brief Constructor + * + * @param col Column on which to iterator over its elements. + * @param with_nulls Indicates if the `col` should be checked for nulls. */ - optional_accessor(column_device_view const& _col) : col{_col} + optional_accessor(column_device_view const& _col, Nullate with_nulls) + : col{_col}, has_nulls{with_nulls} { CUDF_EXPECTS(type_id_matches_device_storage_type(col.type().id()), "the data type mismatch"); + if (with_nulls) { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } } CUDA_DEVICE_CALLABLE thrust::optional operator()(cudf::size_type i) const { - if constexpr (std::is_same_v) { + if (has_nulls) { return (col.is_valid_nocheck(i)) ? thrust::optional{col.element(i)} : thrust::optional{thrust::nullopt}; } return thrust::optional{col.element(i)}; } -}; - -template -struct optional_accessor { - column_device_view const col; ///< column view of column in device - bool has_nulls; - - /** - * @brief constructor - * @param[in] _col column device view of cudf column - * @param[in] with_nulls Indicates if @p _col has nulls - */ - optional_accessor(column_device_view const& _col, bool with_nulls) - : col{_col}, has_nulls{with_nulls} - { - CUDF_EXPECTS(type_id_matches_device_storage_type(col.type().id()), "the data type mismatch"); - if (with_nulls) { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } - } - CUDA_DEVICE_CALLABLE - thrust::optional operator()(cudf::size_type i) const - { - return (has_nulls and col.is_null_nocheck(i)) ? thrust::optional{thrust::nullopt} - : thrust::optional{col.element(i)}; - } + Nullate has_nulls{}; }; /** diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 3e789299716..01742384972 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -171,127 +171,61 @@ auto make_null_replacement_iterator(column_device_view const& column, * * Dereferencing the returned iterator returns a `thrust::optional`. * - * When the element of an iterator contextually converted to bool, the conversion returns true + * The element of this iterator contextually converts to bool. The conversion returns true * if the object contains a value and false if it does not contain a value. * - * make_optional_iterator with mode `DYNAMIC` defers the assumption of nullability to - * runtime, with the user stating on construction of the iterator if column has nulls. - * `DYNAMIC` mode is nice when an algorithm is going to execute on multiple - * iterators and you don't want to compile all the combinations of iterator types + * Calling this function with `nullate::DYNAMIC` defers the assumption + * of nullability to runtime with the caller indicating if the column has nulls. 
+ * This is useful when an algorithm is going to execute on multiple iterators and all
+ * the combinations of iterator types are not required at compile time.
 *
- * Example:
- *
- * \code{.cpp}
+ * @code{.cpp}
 * template <typename T>
 * void some_function(cudf::column_view const& col_view){
 *    auto d_col = cudf::column_device_view::create(col_view);
 *    // Create a `DYNAMIC` optional iterator
-*    auto optional_iterator = cudf::detail::make_optional_iterator<T>(d_col,
-*                                                                     cudf::contains_nulls::DYNAMIC{},
-*                                                                     col_view.has_nulls());
+*    auto optional_iterator =
+*      cudf::detail::make_optional_iterator<T>(
+*        d_col, cudf::nullate::DYNAMIC{col_view.has_nulls()});
 * }
-* \endcode
-*
-* @throws cudf::logic_error if the column is not nullable, and `DYNAMIC` mode used and
-*         the user has stated nulls exist
-* @throws cudf::logic_error if column datatype and Element type mismatch.
-*
-* @tparam Element The type of elements in the column
-* @param column The column to iterate
-* @return Iterator that returns valid column elements and the validity of the
-*         element in a thrust::optional
-*/
-template <typename Element>
-auto make_optional_iterator(column_device_view const& column,
-                            contains_nulls::DYNAMIC,
-                            bool has_nulls)
-{
-  return column.optional_begin<Element>(contains_nulls::DYNAMIC{}, has_nulls);
-}
-
-/**
- * @brief Constructs an optional iterator over a column's values and its validity.
- *
- * Dereferencing the returned iterator returns a `thrust::optional<Element>`.
- *
- * When the element of an iterator contextually converted to bool, the conversion returns true
- * if the object contains a value and false if it does not contain a value.
- *
- * make_optional_iterator with mode `YES` means that the column supports nulls and
- * potentially has null values, therefore the optional might not contain a value
+ * @endcode
 *
- * Example:
+ * Calling this function with `nullate::YES` means that the column supports
+ * nulls and the optional returned might not contain a value.
+ * Calling this function with `nullate::NO` means that the column has no
+ * null values and the optional returned will always contain a value.
 *
- * \code{.cpp}
+ * @code{.cpp}
 * template <typename T, bool has_nulls>
 * void some_function(cudf::column_view const& col_view){
 *    auto d_col = cudf::column_device_view::create(col_view);
 *    if constexpr(has_nulls) {
-*      auto optional_iterator = cudf::detail::make_optional_iterator<T>(d_col,
-*                                                                       cudf::contains_nulls::YES{});
+*      auto optional_iterator =
+*        cudf::detail::make_optional_iterator<T>(d_col, cudf::nullate::YES{});
 *      //use optional_iterator
 *    } else {
-*      auto optional_iterator = cudf::detail::make_optional_iterator<T>(d_col,
-*                                                                       cudf::contains_nulls::NO{});
+*      auto optional_iterator =
+*        cudf::detail::make_optional_iterator<T>(d_col, cudf::nullate::NO{});
 *      //use optional_iterator
 *    }
 * }
-* \endcode
+ * @endcode
 *
- * @throws cudf::logic_error if the column is not nullable, and `YES` mode used
+ * @throws cudf::logic_error if the column is not nullable and `has_nulls` is true.
 * @throws cudf::logic_error if column datatype and Element type mismatch.
 *
- * @tparam Element The type of elements in the column
- * @param column The column to iterate
- * @return Iterator that returns column elements and the validity of the
- *         element as a thrust::optional
- */
-template <typename Element>
-auto make_optional_iterator(column_device_view const& column, contains_nulls::YES)
-{
-  return column.optional_begin<Element>(contains_nulls::YES{});
-}
-
-/**
- * @brief Constructs an optional iterator over a column's values and its validity.
- *
- * Dereferencing the returned iterator returns a `thrust::optional<Element>`.
- *
- * When the element of an iterator contextually converted to bool, the conversion returns true
- * if the object contains a value and false if it does not contain a value.
- *
- * make_optional_iterator with mode `NO` means that the column has no null values,
- * therefore the optional will always contain a value.
- *
- * Example:
- *
- * \code{.cpp}
- * template <typename T, bool has_nulls>
- * void some_function(cudf::column_view const& col_view){
- *    auto d_col = cudf::column_device_view::create(col_view);
- *    if constexpr(has_nulls) {
- *      auto optional_iterator = cudf::detail::make_optional_iterator<T>(d_col,
- *                                                                       cudf::contains_nulls::YES{});
- *      //use optional_iterator
- *    } else {
- *      auto optional_iterator = cudf::detail::make_optional_iterator<T>(d_col,
- *                                                                       cudf::contains_nulls::NO{});
- *      //use optional_iterator
- *    }
- * }
- * \endcode
- *
- * @throws cudf::logic_error if column datatype and Element type mismatch.
+ * @tparam Element The type of elements in the column.
+ * @tparam Nullate A cudf::nullate type describing how to check for nulls.
 *
- * @tparam Element The type of elements in the column
 * @param column The column to iterate
- * @return Iterator that returns column elements and the validity of the
- *         element in a thrust::optional
+ * @param has_nulls Indicates whether `column` is checked for nulls.
+ * @return Iterator that returns valid column elements and the validity of the
+ *         element in a `thrust::optional`
 */
-template <typename Element>
-auto make_optional_iterator(column_device_view const& column, contains_nulls::NO)
+template <typename Element, typename Nullate>
+auto make_optional_iterator(column_device_view const& column, Nullate has_nulls)
 {
-  return column.optional_begin<Element>(contains_nulls::NO{});
+  return column.optional_begin<Element>(has_nulls);
 }

 /**
@@ -447,40 +381,38 @@ auto inline make_scalar_iterator(scalar const& scalar_value)
                                          scalar_value_accessor<Element>{scalar_value});
 }

-template <typename Element, typename contains_nulls_mode>
-struct scalar_optional_accessor;
-
 /**
- * @brief optional accessor of a maybe-nullable scalar
- *
- * The scalar_optional_accessor always returns a thrust::optional of the scalar.
- * The validity of the optional is determined by the contains_nulls_mode template parameter
- * which has the following modes:
+ * @brief Optional accessor for a scalar
 *
- * `DYNAMIC`: Defer nullability checks to runtime
+ * The `scalar_optional_accessor` always returns a `thrust::optional` of the scalar.
+ * The validity of the optional is determined by the `Nullate` parameter which may
+ * be one of the following:
 *
- *  - When `with_nulls=true` the return value will be a `thrust::optional{scalar}`
- *    when scalar is valid, and `thrust::optional{}` when the scalar is invalid.
+ * - `nullate::YES` means that the scalar may be valid or invalid and the optional returned
+ *   will contain a value only if the scalar is valid.
 *
- *  - When `with_nulls=false` the return value will always be `thrust::optional{scalar}`
+ * - `nullate::NO` means the caller attests that the scalar will always be valid,
+ *   no checks will occur and `thrust::optional{scalar}` will be returned
+ *   for each `i`.
 *
- * `NO`: No null values will occur for this scalar, no checks will occur
- *   and `thrust::optional{scalar}` will always be returned.
- *
- * `YES`: null values will occur for this scalar,
- *   and `thrust::optional{scalar}` will always be returned.
+ * - `nullate::DYNAMIC` defers the assumption of nullability to runtime and the caller
+ *   specifies if the scalar may be valid or invalid.
+ *   For `DYNAMIC{true}` the return value will be a `thrust::optional{scalar}` when the
+ *   scalar is valid and a `thrust::optional{}` when the scalar is invalid.
+ *   For `DYNAMIC{false}` the return value will always be a `thrust::optional{scalar}`.
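+ *
+ * A minimal construction sketch (illustrative; assumes a numeric scalar):
+ *
+ * @code{.cpp}
+ * cudf::numeric_scalar<int32_t> s{42};
+ * auto acc = scalar_optional_accessor<int32_t, cudf::nullate::DYNAMIC>{
+ *   s, cudf::nullate::DYNAMIC{s.is_valid()}};
+ * // In device code, acc(i) yields thrust::optional<int32_t>{42} for any `i`
+ * // while `s` remains alive.
+ * @endcode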
 *
 * @throws `cudf::logic_error` if scalar datatype and Element type mismatch.
 *
 * @tparam Element The return type of the functor
+ * @tparam Nullate A cudf::nullate type describing how to check for nulls.
 */
-template <typename Element, typename contains_nulls_mode>
+template <typename Element, typename Nullate>
 struct scalar_optional_accessor : public scalar_value_accessor<Element> {
   using super_t    = scalar_value_accessor<Element>;
   using value_type = thrust::optional<Element>;

-  scalar_optional_accessor(scalar const& scalar_value)
-    : scalar_value_accessor<Element>(scalar_value)
+  scalar_optional_accessor(scalar const& scalar_value, Nullate with_nulls)
+    : scalar_value_accessor<Element>(scalar_value), has_nulls{with_nulls}
   {
   }

@@ -494,32 +426,14 @@ struct scalar_optional_accessor : public scalar_value_accessor<Element> {
   CUDA_HOST_DEVICE_CALLABLE
   const value_type operator()(size_type) const
   {
-    if constexpr (std::is_same_v<contains_nulls_mode, contains_nulls::YES>) {
+    if (has_nulls) {
       return (super_t::dscalar.is_valid()) ? Element{super_t::dscalar.value()}
                                            : value_type{thrust::nullopt};
     }
     return Element{super_t::dscalar.value()};
   }
-};

-template <typename Element>
-struct scalar_optional_accessor<Element, contains_nulls::DYNAMIC>
-  : public scalar_value_accessor<Element> {
-  using super_t    = scalar_value_accessor<Element>;
-  using value_type = thrust::optional<Element>;
-  bool has_nulls;
-
-  scalar_optional_accessor(scalar const& scalar_value, bool with_nulls)
-    : scalar_value_accessor<Element>(scalar_value), has_nulls{with_nulls}
-  {
-  }
-
-  CUDA_HOST_DEVICE_CALLABLE
-  const value_type operator()(size_type) const
-  {
-    return (has_nulls and !super_t::dscalar.is_valid()) ? value_type{thrust::nullopt}
-                                                        : Element{super_t::dscalar.value()};
-  }
+
+  Nullate has_nulls{};
 };

 /**
@@ -622,156 +536,70 @@ struct scalar_representation_pair_accessor : public scalar_value_accessor<Element> {
 * Dereferencing the returned iterator returns a `thrust::optional<Element>`.
 *
- * When the element of an iterator contextually converted to bool, the conversion returns true
+ * The element of this iterator contextually converts to bool. The conversion returns true
 * if the object contains a value and false if it does not contain a value.
 *
 * The iterator behavior is undefined if the scalar is destroyed before iterator dereferencing.
 *
- * make_optional_iterator with mode `DYNAMIC` defers the assumption of nullability to
- * runtime, with the user stating on construction of the iterator if scalar has nulls.
- *
- * Example:
+ * Calling this function with `nullate::DYNAMIC` defers the assumption
+ * of nullability to runtime with the caller indicating if the scalar is valid.
 *
- * \code{.cpp}
+ * @code{.cpp}
 * template <typename T>
 * void some_function(cudf::column_view const& col_view,
 *                    scalar const& scalar_value,
 *                    bool col_has_nulls){
 *    auto d_col = cudf::column_device_view::create(col_view);
-*    auto column_iterator = cudf::detail::make_optional_iterator<T>(d_col,
-                               cudf::contains_nulls::DYNAMIC{}, col_has_nulls);
-*    auto scalar_iterator = cudf::detail::make_optional_iterator<T>(scalar_value,
-                               cudf::contains_nulls::DYNAMIC{}, scalar_value.is_valid());
+*    auto column_iterator = cudf::detail::make_optional_iterator<T>(
+*      d_col, cudf::nullate::DYNAMIC{col_has_nulls});
+*    auto scalar_iterator = cudf::detail::make_optional_iterator<T>(
+*      scalar_value, cudf::nullate::DYNAMIC{scalar_value.is_valid()});
 *    //use iterators
 * }
-* \endcode
-*
-* @throws cudf::logic_error if the scalar is not nullable, and `DYNAMIC` mode used and
-*         the user has stated nulls exist
-* @throws cudf::logic_error if scalar datatype and Element type mismatch.
-*
-* @tparam Element The type of elements in the scalar
-* @tparam has_nulls If the scalar value will have a null at runtime
-* @param scalar_value The scalar to iterate
-* @return Iterator that returns scalar elements and validity of the
-*         element in a thrust::optional
-*/
-template <typename Element>
-auto inline make_optional_iterator(scalar const& scalar_value,
-                                   contains_nulls::DYNAMIC,
-                                   bool has_nulls)
-{
-  CUDF_EXPECTS(type_id_matches_device_storage_type<Element>(scalar_value.type().id()),
-               "the data type mismatch");
-  return thrust::make_transform_iterator(
-    thrust::make_constant_iterator<size_type>(0),
-    scalar_optional_accessor<Element, contains_nulls::DYNAMIC>{scalar_value, has_nulls});
-}
-
-/**
- * @brief Constructs an optional iterator over a scalar's values and its validity.
- *
- * Dereferencing the returned iterator returns a `thrust::optional<Element>`.
- *
- * When the element of an iterator contextually converted to bool, the conversion returns true
- * if the object contains a value and false if it does not contain a value.
- *
- * The iterator behavior is undefined if the scalar is destroyed before iterator dereferencing.
- *
- * make_optional_iterator ith mode `YES` means that the scalar supports nulls and
- * potentially has null values, therefore the optional might not contain a value
- * therefore the optional will always contain a value.
+ * @endcode
 *
- * Example:
+ * Calling this function with `nullate::YES` means that the scalar may be invalid
+ * and the optional returned might not contain a value.
+ * Calling this function with `nullate::NO` means that the scalar is valid
+ * and the optional returned will always contain a value.
* - * \code{.cpp} + * @code{.cpp} * template * void some_function(cudf::column_view const& col_view, scalar const& scalar_value){ * auto d_col = cudf::column_device_view::create(col_view); * if constexpr(any_nulls) { - * auto column_iterator = cudf::detail::make_optional_iterator(d_col, - * cudf::contains_nulls::YES{}); - * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value, - * cudf::contains_nulls::YES{}); + * auto column_iterator = + * cudf::detail::make_optional_iterator(d_col, cudf::nullate::YES{}); + * auto scalar_iterator = + * cudf::detail::make_optional_iterator(scalar_value, cudf::nullate::YES{}); * //use iterators * } else { - * auto column_iterator = cudf::detail::make_optional_iterator(d_col, - * cudf::contains_nulls::NO{}); - * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value, - * cudf::contains_nulls::NO{}); + * auto column_iterator = + * cudf::detail::make_optional_iterator(d_col, cudf::nullate::NO{}); + * auto scalar_iterator = + * cudf::detail::make_optional_iterator(scalar_value, cudf::nullate::NO{}); * //use iterators * } * } - * \endcode + * @endcode * - * @throws cudf::logic_error if the scalar is not nullable, and `YES` mode used * @throws cudf::logic_error if scalar datatype and Element type mismatch. * * @tparam Element The type of elements in the scalar - * @param scalar_value The scalar to iterate - * @return Iterator that returns scalar elements and the validity of the - * element in a thrust::optional - */ -template -auto inline make_optional_iterator(scalar const& scalar_value, contains_nulls::YES) -{ - CUDF_EXPECTS(type_id_matches_device_storage_type(scalar_value.type().id()), - "the data type mismatch"); - return thrust::make_transform_iterator( - thrust::make_constant_iterator(0), - scalar_optional_accessor{scalar_value}); -} - -/** - * @brief Constructs an optional iterator over a scalar's values and its validity. - * - * Dereferencing the returned iterator returns a `thrust::optional`. - * - * When the element of an iterator contextually converted to bool, the conversion returns true - * if the object contains a value and false if it does not contain a value. - * - * The iterator behavior is undefined if the scalar is destroyed before iterator dereferencing. - * - * make_optional_iterator with mode `NO` means that the scalar has no null values, - * therefore the optional will always contain a value. + * @tparam Nullate A cudf::nullate type describing how to check for nulls. * - * Example: - * - * \code{.cpp} - * template - * void some_function(cudf::column_view const& col_view, scalar const& scalar_value){ - * auto d_col = cudf::column_device_view::create(col_view); - * if constexpr(any_nulls) { - * auto column_iterator = cudf::detail::make_optional_iterator(d_col, - * cudf::contains_nulls::YES{}); - * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value, - * cudf::contains_nulls::YES{}); - * //use iterators - * } else { - * auto column_iterator = cudf::detail::make_optional_iterator(d_col, - * cudf::contains_nulls::NO{}); - * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value, - * cudf::contains_nulls::NO{}); - * //use iterators - * } - * } - * \endcode - * - * @throws cudf::logic_error if scalar datatype and Element type mismatch. 
- *
- * @tparam Element The type of elements in the scalar
- * @param scalar_value The scalar to iterate
- * @return Iterator that returns scalar elements and the validity of the
- *         element in a thrust::optional
+ * @param scalar_value The scalar to be returned by the iterator.
+ * @param has_nulls Indicates if the scalar value may be invalid.
+ * @return Iterator that returns the scalar and the validity of the scalar in a thrust::optional
 */
-template <typename Element>
-auto inline make_optional_iterator(scalar const& scalar_value, contains_nulls::NO)
+template <typename Element, typename Nullate>
+auto inline make_optional_iterator(scalar const& scalar_value, Nullate has_nulls)
 {
   CUDF_EXPECTS(type_id_matches_device_storage_type<Element>(scalar_value.type().id()),
                "the data type mismatch");
   return thrust::make_transform_iterator(
     thrust::make_constant_iterator<size_type>(0),
-    scalar_optional_accessor<Element, contains_nulls::NO>{scalar_value});
+    scalar_optional_accessor<Element, Nullate>{scalar_value, has_nulls});
 }

 /**
diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh
index ec83e348e33..f141d9b5d59 100644
--- a/cpp/include/cudf/detail/merge.cuh
+++ b/cpp/include/cudf/detail/merge.cuh
@@ -90,8 +90,8 @@ struct tagged_element_relational_comparator {
       column_device_view const* ptr_right_dview{r_side == side::LEFT ? &lhs : &rhs};

-      auto erl_comparator =
-        element_relational_comparator<has_nulls>(*ptr_left_dview, *ptr_right_dview, null_precedence);
+      auto erl_comparator = element_relational_comparator(
+        nullate::DYNAMIC{has_nulls}, *ptr_left_dview, *ptr_right_dview, null_precedence);

       return cudf::type_dispatcher(lhs.type(), erl_comparator, l_indx, r_indx);
     }
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index 70ccac2f75d..0f3ca073380 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -50,9 +50,9 @@ namespace detail {
 /**
 * @brief Compare the elements ordering with respect to `lhs`.
 *
- * @param[in] lhs first element
- * @param[in] rhs second element
- * @return weak_ordering Indicates the relationship between the elements in
+ * @param lhs first element
+ * @param rhs second element
+ * @return Indicates the relationship between the elements in
 *         the `lhs` and `rhs` columns.
 */
 template <typename Element>
@@ -69,14 +69,15 @@ __device__ weak_ordering compare_elements(Element lhs, Element rhs)
 /**
 * @brief A specialization for floating-point `Element` type relational comparison
- * to derive the order of the elements with respect to `lhs`. Specialization is to
- * handle `nan` in the order shown below.
+ * to derive the order of the elements with respect to `lhs`.
+ *
+ * This specialization handles `nan` in the following order:
 * `[-Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN, null] (for null_order::AFTER)`
 * `[null, -Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN] (for null_order::BEFORE)`
 *
- * @param[in] lhs first element
- * @param[in] rhs second element
- * @return weak_ordering Indicates the relationship between the elements in
+ * @param lhs first element
+ * @param rhs second element
+ * @return Indicates the relationship between the elements in
 *         the `lhs` and `rhs` columns.
 */
 template <typename Element, std::enable_if_t<std::is_floating_point<Element>::value>* = nullptr>
@@ -119,7 +120,7 @@ inline __device__ auto null_compare(bool lhs_is_null, bool rhs_is_null, null_ord
 *
 * @param[in] lhs first element
 * @param[in] rhs second element
- * @return weak_ordering Indicates the relationship between the elements in
+ * @return Indicates the relationship between the elements in
 *         the `lhs` and `rhs` columns.
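 *
 * For instance (an illustrative sketch of the expected results for integers):
 *
 * @code{.cpp}
 * relational_compare(1, 2);  // weak_ordering::LESS
 * relational_compare(2, 2);  // weak_ordering::EQUIVALENT
 * relational_compare(3, 2);  // weak_ordering::GREATER
 * @endcode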
 */
 template <typename Element, std::enable_if_t<not std::is_floating_point<Element>::value>* = nullptr>
@@ -132,9 +133,9 @@ __device__ weak_ordering relational_compare(Element lhs, Element rhs)
 /**
 * @brief A specialization for floating-point `Element` type to check if
 * `lhs` is equivalent to `rhs`. `nan == nan`.
 *
- * @param[in] lhs first element
- * @param[in] rhs second element
- * @return bool `true` if `lhs` == `rhs` else `false`.
+ * @param lhs first element
+ * @param rhs second element
+ * @return `true` if `lhs` == `rhs` else `false`.
 */
 template <typename Element, std::enable_if_t<std::is_floating_point<Element>::value>* = nullptr>
 __device__ bool equality_compare(Element lhs, Element rhs)
@@ -147,9 +148,9 @@ __device__ bool equality_compare(Element lhs, Element rhs)
 * @brief A specialization for non-floating-point `Element` type to check if
 * `lhs` is equivalent to `rhs`.
 *
- * @param[in] lhs first element
- * @param[in] rhs second element
- * @return bool `true` if `lhs` == `rhs` else `false`.
+ * @param lhs first element
+ * @param rhs second element
+ * @return `true` if `lhs` == `rhs` else `false`.
 */
 template <typename Element, std::enable_if_t<not std::is_floating_point<Element>::value>* = nullptr>
 __device__ bool equality_compare(Element const lhs, Element const rhs)
@@ -160,9 +161,9 @@ __device__ bool equality_compare(Element const lhs, Element const rhs)
 /**
 * @brief Performs an equality comparison between two elements in two columns.
 *
- * @tparam has_nulls Indicates the potential for null values in either column.
+ * @tparam Nullate A cudf::nullate type describing how to check for nulls.
 */
-template <bool has_nulls = true>
+template <typename Nullate>
 class element_equality_comparator {
 public:
  /**
   * @brief Construct type-dispatched function object for comparing equality
   * between two elements.
   *
   * @note `lhs` and `rhs` may be the same.
   *
+   * @param has_nulls Indicates if either input column contains nulls.
   * @param lhs The column containing the first element
   * @param rhs The column containing the second element (may be the same as lhs)
   * @param nulls_are_equal Indicates if two null elements are treated as equivalent
   */
-  __host__ __device__ element_equality_comparator(column_device_view lhs,
-                                                  column_device_view rhs,
-                                                  bool nulls_are_equal = true)
-    : lhs{lhs}, rhs{rhs}, nulls_are_equal{nulls_are_equal}
+  __host__ __device__
+  element_equality_comparator(Nullate has_nulls,
+                              column_device_view lhs,
+                              column_device_view rhs,
+                              null_equality nulls_are_equal = null_equality::EQUAL)
+    : lhs{lhs}, rhs{rhs}, nulls{has_nulls}, nulls_are_equal{nulls_are_equal}
  {
  }

  /**
   * @brief Compares the specified elements for equality.
   *
   * @param lhs_element_index The index of the first element
   * @param rhs_element_index The index of the second element
   * @return True if the `lhs` and `rhs` elements are both null and `nulls_are_equal` is true,
   *         or if they compare equal
-   *
   */
  template <typename Element,
            std::enable_if_t<cudf::is_equality_comparable<Element, Element>()>* = nullptr>
  __device__ bool operator()(size_type lhs_element_index,
                             size_type rhs_element_index) const noexcept
  {
-    if (has_nulls) {
+    if (nulls) {
      bool const lhs_is_null{lhs.is_null(lhs_element_index)};
      bool const rhs_is_null{rhs.is_null(rhs_element_index)};
      if (lhs_is_null and rhs_is_null) {
-        return nulls_are_equal;
+        return nulls_are_equal == null_equality::EQUAL;
      } else if (lhs_is_null != rhs_is_null) {
        return false;
      }
    }

    return equality_compare(lhs.element<Element>(lhs_element_index),
                            rhs.element<Element>(rhs_element_index));
  }

@@ -220,14 +223,18 @@ class element_equality_comparator {
 private:
  column_device_view lhs;
  column_device_view rhs;
-  bool nulls_are_equal;
+  Nullate nulls;
+  null_equality nulls_are_equal;
};
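+
+// A usage sketch (illustrative, with hypothetical names): in device code, one
+// element from each of two column_device_views `d_lhs` and `d_rhs` can be
+// compared with a runtime null check via the Nullate-based comparator:
+//
+//   auto comp = element_equality_comparator{cudf::nullate::DYNAMIC{true}, d_lhs, d_rhs};
+//   bool eq   = cudf::type_dispatcher(d_lhs.type(), comp, lhs_row, rhs_row);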
-template <bool has_nulls = true>
+template <typename Nullate>
 class row_equality_comparator {
 public:
-  row_equality_comparator(table_device_view lhs, table_device_view rhs, bool nulls_are_equal = true)
-    : lhs{lhs}, rhs{rhs}, nulls_are_equal{nulls_are_equal}
+  row_equality_comparator(Nullate has_nulls,
+                          table_device_view lhs,
+                          table_device_view rhs,
+                          null_equality nulls_are_equal = null_equality::EQUAL)
+    : lhs{lhs}, rhs{rhs}, nulls{has_nulls}, nulls_are_equal{nulls_are_equal}
  {
    CUDF_EXPECTS(lhs.num_columns() == rhs.num_columns(), "Mismatched number of columns.");
  }

@@ -236,7 +243,7 @@ class row_equality_comparator {
  {
    auto equal_elements = [=](column_device_view l, column_device_view r) {
      return cudf::type_dispatcher(l.type(),
-                                   element_equality_comparator<has_nulls>{l, r, nulls_are_equal},
+                                   element_equality_comparator{nulls, l, r, nulls_are_equal},
                                    lhs_row_index,
                                    rhs_row_index);
    };

@@ -247,15 +254,16 @@ class row_equality_comparator {
 private:
  table_device_view lhs;
  table_device_view rhs;
-  bool nulls_are_equal;
+  Nullate nulls;
+  null_equality nulls_are_equal;
};

/**
 * @brief Performs a relational comparison between two elements in two columns.
 *
- * @tparam has_nulls Indicates the potential for null values in either column.
+ * @tparam Nullate A cudf::nullate type describing how to check for nulls.
 */
-template <bool has_nulls = true>
+template <typename Nullate>
 class element_relational_comparator {
 public:
  /**
   * @brief Construct type-dispatched function object for performing a
   * relational comparison between two elements.
   *
   * @note `lhs` and `rhs` may be the same.
   *
+   * @param has_nulls Indicates if either input column contains nulls.
   * @param lhs The column containing the first element
   * @param rhs The column containing the second element (may be the same as lhs)
-   * @param null_precedence Indicates how null values are ordered with other
-   *        values
+   * @param null_precedence Indicates how null values are ordered with other values
   */
-  __host__ __device__ element_relational_comparator(column_device_view lhs,
+  __host__ __device__ element_relational_comparator(Nullate has_nulls,
+                                                    column_device_view lhs,
                                                    column_device_view rhs,
                                                    null_order null_precedence)
-    : lhs{lhs}, rhs{rhs}, null_precedence{null_precedence}
+    : lhs{lhs}, rhs{rhs}, nulls{has_nulls}, null_precedence{null_precedence}
+  {
+  }
+
+  __host__ __device__ element_relational_comparator(Nullate has_nulls,
+                                                    column_device_view lhs,
+                                                    column_device_view rhs)
+    : lhs{lhs}, rhs{rhs}, nulls{has_nulls}
  {
  }

  /**
   * @brief Performs a relational comparison between the specified elements
   *
   * @param lhs_element_index The index of the first element
   * @param rhs_element_index The index of the second element
-   * @return weak_ordering Indicates the relationship between the elements in
+   * @return Indicates the relationship between the elements in
   *         the `lhs` and `rhs` columns.
   */
-template <bool has_nulls = true>
+template <typename Nullate>
 class row_lexicographic_comparator {
 public:
  /**
   * @brief Construct a function object for performing a lexicographic
   * comparison between the rows of two tables.
   *
   * @param lhs The first table
   * @param rhs The second table (may be the same table as `lhs`)
+   * @param has_nulls Indicates if either input table contains columns with nulls.
   * @param column_order Optional, device array the same length as a row that
   * indicates the desired ascending/descending order of each column in a row.
   * If `nullptr`, it is assumed all columns are sorted in ascending order.
   * @param null_precedence Optional, device array the same length as a row and indicates how
   * null values compare to all other values for every column. If it is nullptr, then null
   * precedence would be `null_order::BEFORE` for all columns.
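+   *
+   * A construction sketch (illustrative; `lhs` and `rhs` here are assumed to be
+   * relationally comparable cudf::table_views):
+   *
+   * @code{.cpp}
+   * auto d_lhs = cudf::table_device_view::create(lhs);
+   * auto d_rhs = cudf::table_device_view::create(rhs);
+   * row_lexicographic_comparator comp{
+   *   cudf::nullate::DYNAMIC{cudf::has_nulls(lhs) or cudf::has_nulls(rhs)}, *d_lhs, *d_rhs};
+   * // comp(i, j) is true when row i of lhs sorts lexicographically before row j of rhs
+   * @endcode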
   */
-  row_lexicographic_comparator(table_device_view lhs,
+  row_lexicographic_comparator(Nullate has_nulls,
+                               table_device_view lhs,
                               table_device_view rhs,
                               order const* column_order         = nullptr,
                               null_order const* null_precedence = nullptr)
-    : _lhs{lhs}, _rhs{rhs}, _column_order{column_order}, _null_precedence{null_precedence}
+    : _lhs{lhs},
+      _rhs{rhs},
+      _nulls{has_nulls},
+      _column_order{column_order},
+      _null_precedence{null_precedence}
  {
    CUDF_EXPECTS(_lhs.num_columns() == _rhs.num_columns(), "Mismatched number of columns.");
    CUDF_EXPECTS(detail::is_relationally_comparable(_lhs, _rhs),
@@ -376,14 +399,14 @@ class row_lexicographic_comparator {
    for (size_type i = 0; i < _lhs.num_columns(); ++i) {
      bool ascending = (_column_order == nullptr) or (_column_order[i] == order::ASCENDING);

-      weak_ordering state{weak_ordering::EQUIVALENT};
      null_order null_precedence =
        _null_precedence == nullptr ? null_order::BEFORE : _null_precedence[i];

      auto comparator =
-        element_relational_comparator<has_nulls>{_lhs.column(i), _rhs.column(i), null_precedence};
+        element_relational_comparator{_nulls, _lhs.column(i), _rhs.column(i), null_precedence};

-      state = cudf::type_dispatcher(_lhs.column(i).type(), comparator, lhs_index, rhs_index);
+      weak_ordering state =
+        cudf::type_dispatcher(_lhs.column(i).type(), comparator, lhs_index, rhs_index);

      if (state == weak_ordering::EQUIVALENT) { continue; }

@@ -395,6 +418,7 @@ class row_lexicographic_comparator {
 private:
  table_device_view _lhs;
  table_device_view _rhs;
+  Nullate _nulls{};
  null_order const* _null_precedence{};
  order const* _column_order{};
};  // class row_lexicographic_comparator

@@ -403,9 +427,9 @@ class row_lexicographic_comparator {
 /**
 * @brief Computes the hash value of an element in the given column.
 *
 * @tparam hash_function Hash functor to use for hashing elements.
- * @tparam has_nulls Indicates the potential for null values in the column.
+ * @tparam Nullate A cudf::nullate type describing how to check for nulls.
 */
-template