From ad0254566ed9f1cb33338feb156c5751ec3747e4 Mon Sep 17 00:00:00 2001 From: AJ Schmidt Date: Thu, 4 Nov 2021 10:13:34 -0400 Subject: [PATCH 001/202] DOC v22.02 Updates --- CHANGELOG.md | 4 ++++ conda/environments/cudf_dev_cuda11.0.yml | 2 +- conda/environments/cudf_dev_cuda11.2.yml | 2 +- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 4 ++-- cpp/examples/basic/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 4 ++-- 7 files changed, 12 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b46ac22d767..4dd94954a82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# cuDF 22.02.00 (Date TBD) + +Please see https://github.com/rapidsai/cudf/releases/tag/v22.02.00a for the latest changes to this development branch. + # cuDF 21.12.00 (Date TBD) Please see https://github.com/rapidsai/cudf/releases/tag/v21.12.00a for the latest changes to this development branch. diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 803e4f0ba26..60a5959a23f 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -10,7 +10,7 @@ dependencies: - clang=11.0.0 - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - - rmm=21.12.* + - rmm=22.02.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 2281d361ebd..7904593c4c7 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -10,7 +10,7 @@ dependencies: - clang=11.0.0 - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - - rmm=21.12.* + - rmm=22.02.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1a0c853ef48..bd08717ff43 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/RAPIDS.cmake +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.02/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake ) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 1141f20e3b1..55e5119040e 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 21.12.00 +PROJECT_NUMBER = 22.02.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2167,7 +2167,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.12 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/22.02 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. 
See section "Linking to diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index df44ac31d90..9bb021f1429 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -14,7 +14,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-21.12) +set(CUDF_TAG branch-22.02) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 4a7d115ae3b..2c184252192 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -80,9 +80,9 @@ # built documents. # # The short X.Y version. -version = '21.12' +version = '22.02' # The full version, including alpha/beta/rc tags. -release = '21.12.00' +release = '22.02.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From d64e2749a608d0eca79f5baa01ce5e13afaadc96 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 12 Nov 2021 19:47:46 -0600 Subject: [PATCH 002/202] Fix links in C++ Developer Guide. (#9675) This PR improves the C++ developer guide. My primary goal was to fix some invalid links. The diff is a bit large because of some minor changes in the interest of establishing consistent style and improving the reading/editing experience. (e.g. replacing a few instances of tabs with spaces, trimming trailing whitespace, wrapping sections that were not wrapped like the rest of the file, and correcting typos that I came across while reading). To save time, I recommend that reviewers use the option in GitHub's review tab that will ignore whitespace changes. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9675 --- cpp/docs/BENCHMARKING.md | 22 +- cpp/docs/DEVELOPER_GUIDE.md | 580 ++++++++++++++++++------------------ cpp/docs/TESTING.md | 160 +++++----- 3 files changed, 384 insertions(+), 378 deletions(-) diff --git a/cpp/docs/BENCHMARKING.md b/cpp/docs/BENCHMARKING.md index ddf7e177516..ed99ff5f1be 100644 --- a/cpp/docs/BENCHMARKING.md +++ b/cpp/docs/BENCHMARKING.md @@ -8,16 +8,16 @@ other benchmarks in `cpp/benchmarks` to understand the options. ## Directory and File Naming -The naming of unit benchmark directories and source files should be consistent with the feature -being benchmarked. For example, the benchmarks for APIs in `copying.hpp` should live in -`cudf/cpp/benchmarks/copying`. Each feature (or set of related features) should have its own +The naming of unit benchmark directories and source files should be consistent with the feature +being benchmarked. For example, the benchmarks for APIs in `copying.hpp` should live in +`cudf/cpp/benchmarks/copying`. Each feature (or set of related features) should have its own benchmark source file named `_benchmark.cu/cpp`. For example, -`cudf/cpp/src/copying/scatter.cu` has benchmarks in +`cudf/cpp/src/copying/scatter.cu` has benchmarks in `cudf/cpp/benchmarks/copying/scatter_benchmark.cu`. -In the interest of improving compile time, whenever possible, test source files should be `.cpp` +In the interest of improving compile time, whenever possible, test source files should be `.cpp` files because `nvcc` is slower than `gcc` in compiling host code. Note that `thrust::device_vector` -includes device code, and so must only be used in `.cu` files. `rmm::device_uvector`, +includes device code, and so must only be used in `.cu` files. 
`rmm::device_uvector`, `rmm::device_buffer` and the various `column_wrapper` types described in [Testing](TESTING.md) can be used in `.cpp` files, and are therefore preferred in test code over `thrust::device_vector`. @@ -25,7 +25,7 @@ can be used in `.cpp` files, and are therefore preferred in test code over `thru CUDA computations and operations like copies are typically asynchronous with respect to host code, so it is important to carefully synchronize in order to ensure the benchmark timing is not stopped -before the feature you are benchmarking has completed. An RAII helper class `cuda_event_timer` is +before the feature you are benchmarking has completed. An RAII helper class `cuda_event_timer` is provided in `cpp/benchmarks/synchronization/synchronization.hpp` to help with this. This class can also optionally clear the GPU L2 cache in order to ensure cache hits do not artificially inflate performance in repeated iterations. @@ -35,10 +35,10 @@ performance in repeated iterations. In general, we should benchmark all features over a range of data sizes and types, so that we can catch regressions across libcudf changes. However, running many benchmarks is expensive, so ideally we should sample the parameter space in such a way to get good coverage without having to test -exhaustively. +exhaustively. -A rule of thumb is that we should benchmark with enough data to reach the point where the algorithm -reaches its saturation bottleneck, whether that bottleneck is bandwidth or computation. Using data +A rule of thumb is that we should benchmark with enough data to reach the point where the algorithm +reaches its saturation bottleneck, whether that bottleneck is bandwidth or computation. Using data sets larger than this point is generally not helpful, except in specific cases where doing so -exercises different code and can therefore uncover regressions that smaller benchmarks will not +exercises different code and can therefore uncover regressions that smaller benchmarks will not (this should be rare). diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 18860504bf1..5e465ed6991 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -1,31 +1,31 @@ # libcudf C++ Developer Guide -This document serves as a guide for contributors to libcudf C++ code. Developers should also refer +This document serves as a guide for contributors to libcudf C++ code. Developers should also refer to these additional files for further documentation of libcudf best practices. * [Documentation Guide](DOCUMENTATION.md) for guidelines on documenting libcudf code. * [Testing Guide](TESTING.md) for guidelines on writing unit tests. -* [Benchmarking Guide](TODO) for guidelines on writing unit benchmarks. +* [Benchmarking Guide](BENCHMARKING.md) for guidelines on writing unit benchmarks. # Overview -libcudf is a C++ library that provides GPU-accelerated data-parallel algorithms for processing -column-oriented tabular data. libcudf provides algorithms including slicing, filtering, sorting, +libcudf is a C++ library that provides GPU-accelerated data-parallel algorithms for processing +column-oriented tabular data. libcudf provides algorithms including slicing, filtering, sorting, various types of aggregations, and database-type operations such as grouping and joins. libcudf serves a number of clients via multiple language interfaces, including Python and Java. Users may also use libcudf directly from C++ code. 
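To give a feel for what this looks like in practice, here is a minimal hedged sketch of calling libcudf from C++. The wrapper function is hypothetical; `cudf::sort` and the view/ownership types it uses are described later in this guide.

```c++
#include <cudf/sorting.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>

// `input` is a non-owning view of columns that live elsewhere; the result is
// a new owning table whose device memory is allocated by the call.
std::unique_ptr<cudf::table> sorted_copy(cudf::table_view const& input)
{
  return cudf::sort(input);
}
```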
## Lexicon -This section defines terminology used within libcudf +This section defines terminology used within libcudf. ### Column -A column is an array of data of a single type. Along with Tables, columns are the fundamental data +A column is an array of data of a single type. Along with Tables, columns are the fundamental data structures used in libcudf. Most libcudf algorithms operate on columns. Columns may have a validity -mask representing whether each element is valid or null (invalid). Columns of nested types are +mask representing whether each element is valid or null (invalid). Columns of nested types are supported, meaning that a column may have child columns. A column is the C++ equivalent to a cuDF -Python [series](https://docs.rapids.ai/api/cudf/stable/api.html#series) +Python [Series](https://docs.rapids.ai/api/cudf/stable/api_docs/series.html). ### Element @@ -37,29 +37,29 @@ A type representing a single element of a data type. ### Table -A table is a collection of columns with equal number of elements. A table is the C++ equivalent to -a cuDF Python [data frame](https://docs.rapids.ai/api/cudf/stable/api.html#dataframe). +A table is a collection of columns with equal number of elements. A table is the C++ equivalent to +a cuDF Python [DataFrame](https://docs.rapids.ai/api/cudf/stable/api_docs/dataframe.html). ### View -A view is a non-owning object that provides zero-copy access (possibly with slicing or offsets) data -owned by another object. Examples are column views and table views. +A view is a non-owning object that provides zero-copy access (possibly with slicing or offsets) to +data owned by another object. Examples are column views and table views. # Directory Structure and File Naming -External/public libcudf APIs are grouped based on functionality into an appropriately titled -header file in `cudf/cpp/include/cudf/`. For example, `cudf/cpp/include/cudf/copying.hpp` -contains the APIs for functions related to copying from one column to another. Note the `.hpp` +External/public libcudf APIs are grouped based on functionality into an appropriately titled +header file in `cudf/cpp/include/cudf/`. For example, `cudf/cpp/include/cudf/copying.hpp` +contains the APIs for functions related to copying from one column to another. Note the `.hpp` file extension used to indicate a C++ header file. -Header files should use the `#pragma once` include guard. +Header files should use the `#pragma once` include guard. -The naming of external API headers should be consistent with the name of the folder that contains +The naming of external API headers should be consistent with the name of the folder that contains the source files that implement the API. For example, the implementation of the APIs found in -`cudf/cpp/include/cudf/copying.hpp` are located in `cudf/src/copying`. Likewise, the unit tests for +`cudf/cpp/include/cudf/copying.hpp` are located in `cudf/src/copying`. Likewise, the unit tests for the APIs reside in `cudf/tests/copying/`. -Internal API headers containing `detail` namespace definitions that are used across translation +Internal API headers containing `detail` namespace definitions that are used across translation units inside libcudf should be placed in `include/cudf/detail`. ## File extensions @@ -75,22 +75,24 @@ execution policy (always `rmm::exec_policy` in libcudf). 
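As an illustration of the naming and layout conventions above, a hypothetical new feature called `rotate` might be organized as in the following sketch (the feature name and paths are invented for this example):

```c++
// cpp/include/cudf/rotate.hpp -- public header named after the feature
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>

#include <memory>

namespace cudf {

// Public API; the implementation would live in cpp/src/rotate/rotate.cu and
// the unit tests in cpp/tests/rotate/.
std::unique_ptr<column> rotate(column_view const& input);

}  // namespace cudf
```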
## Code and Documentation Style and Formatting -libcudf code uses [snake_case](https://en.wikipedia.org/wiki/Snake_case) for all names except in a -few cases: template parameters, unit tests and test case names may use Pascal case, aka -[UpperCamelCase](https://en.wikipedia.org/wiki/Camel_case). We do not use [Hungarian notation](https://en.wikipedia.org/wiki/Hungarian_notation), except sometimes when naming device data variables and their corresponding -host copies. Private member variables are typically prefixed with an underscore. +libcudf code uses [snake_case](https://en.wikipedia.org/wiki/Snake_case) for all names except in a +few cases: template parameters, unit tests and test case names may use Pascal case, aka +[UpperCamelCase](https://en.wikipedia.org/wiki/Camel_case). We do not use +[Hungarian notation](https://en.wikipedia.org/wiki/Hungarian_notation), except sometimes when naming +device data variables and their corresponding host copies. Private member variables are typically +prefixed with an underscore. ```c++ template <typename T> -void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_memory_resource* mr) +void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_memory_resource* mr) { ... } -class utility_class +class utility_class { ... - private: +private: int _rating{}; std::unique_ptr<cudf::column> _column{}; }; @@ -103,26 +105,26 @@ TYPED_TEST(RepeatTypedTestFixture, RepeatScalarCount) } ``` -C++ formatting is enforced using `clang-format`. You should configure `clang-format` on your -machine to use the `cudf/cpp/.clang-format` configuration file, and run `clang-format` on all -changed code before committing it. The easiest way to do this is to configure your editor to -"format on save". +C++ formatting is enforced using `clang-format`. You should configure `clang-format` on your +machine to use the `cudf/cpp/.clang-format` configuration file, and run `clang-format` on all +changed code before committing it. The easiest way to do this is to configure your editor to +"format on save." Aspects of code style not discussed in this document and not automatically enforceable are typically caught during code review, or not enforced. ### C++ Guidelines -In general, we recommend following -[C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines). We also -recommend watching Sean Parent's [C++ Seasoning talk](https://www.youtube.com/watch?v=W2tWOdzgXHA), -and we try to follow his rules: "No raw loops. No raw pointers. No raw synchronization primitives." +In general, we recommend following +[C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines). We also +recommend watching Sean Parent's [C++ Seasoning talk](https://www.youtube.com/watch?v=W2tWOdzgXHA), +and we try to follow his rules: "No raw loops. No raw pointers. No raw synchronization primitives." * Prefer algorithms from STL and Thrust to raw loops. * Prefer libcudf and RMM [owning data structures and views](#libcudf-data-structures) to raw pointers and raw memory allocation. * libcudf doesn't have a lot of CPU-thread concurrency, but there is some. And currently libcudf does use raw synchronization primitives. So we should revisit Parent's third rule and improve here. Documentation is discussed in the [Documentation Guide](DOCUMENTATION.md).
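To illustrate the "no raw loops" rule, here is a hedged sketch of replacing a hand-rolled device loop with a Thrust algorithm. The function is hypothetical, and compiling the device lambda assumes nvcc's extended-lambda support.

```c++
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/transform.h>

// Doubles every element of `v` without writing a raw kernel or loop.
void double_all(rmm::device_uvector<int>& v, rmm::cuda_stream_view stream)
{
  thrust::transform(rmm::exec_policy(stream), v.begin(), v.end(), v.begin(),
                    [] __device__(int x) { return 2 * x; });
}
```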
@@ -131,28 +133,28 @@ Documentation is discussed in the [Documentation Guide](DOCUMENTATION.md). ## Includes The following guidelines apply to organizing `#include` lines. - * Group includes by library (e.g. cuDF, RMM, Thrust, STL). `clang-format` will respect the + * Group includes by library (e.g. cuDF, RMM, Thrust, STL). `clang-format` will respect the groupings and sort the individual includes within a group lexicographically. * Separate groups by a blank line. - * Order the groups from "nearest" to "farthest". In other words, local includes, then includes - from other RAPIDS libraries, then includes from related libraries, like `<thrust/copy.h>`, then - includes from dependencies installed with cuDF, and then standard headers (for example `<string>`, + * Order the groups from "nearest" to "farthest". In other words, local includes, then includes + from other RAPIDS libraries, then includes from related libraries, like `<thrust/copy.h>`, then + includes from dependencies installed with cuDF, and then standard headers (for example `<string>`, `<iostream>`). - * Use <> instead of "" unless the header is in the same directory as the source file. + * Use `<>` instead of `""` unless the header is in the same directory as the source file. * Tools like `clangd` often auto-insert includes when they can, but they usually get the grouping and brackets wrong. - * Always check that includes are only necessary for the file in which they are included. - Try to avoid excessive including especially in header files. Double check this when you remove + * Always check that includes are only necessary for the file in which they are included. + Try to avoid excessive including especially in header files. Double check this when you remove code. * Use quotes `"` to include local headers from the same relative source directory. This should only - occur in source files and non-public header files. Otherwise use angle brackets `<>` around + occur in source files and non-public header files. Otherwise use angle brackets `<>` around included header filenames. * Avoid relative paths with `..` when possible. Paths with `..` are necessary when including - (internal) headers from source paths not in the same directory as the including file, + (internal) headers from source paths not in the same directory as the including file, because source paths are not passed with `-I`. * Avoid including library internal headers from non-internal files. For example, try not to include - headers from libcudf `src` directories in tests or in libcudf public headers. If you find - yourself doing this, start a discussion about moving (parts of) the included internal header + headers from libcudf `src` directories in tests or in libcudf public headers. If you find + yourself doing this, start a discussion about moving (parts of) the included internal header to a public header. # libcudf Data Structures @@ -162,14 +164,14 @@ data structures you will use when developing libcudf code. ## Views and Ownership -Resource ownership is an essential concept in libcudf. In short, an "owning" object owns a -resource (such as device memory). It acquires that resource during construction and releases the +Resource ownership is an essential concept in libcudf. In short, an "owning" object owns a +resource (such as device memory). It acquires that resource during construction and releases the resource in destruction ([RAII](https://en.cppreference.com/w/cpp/language/raii)). A "non-owning" object does not own resources.
Any class in libcudf with the `*_view` suffix is non-owning. For more detail see the [`libcudf++` presentation.](https://docs.google.com/presentation/d/1zKzAtc1AWFKfMhiUlV5yRZxSiPLwsObxMlWRWz_f5hA/edit?usp=sharing) libcudf functions typically take views as input (`column_view`, `table_view`, or `scalar_view`) and produce `unique_ptr`s to owning objects as output. For example, ```c++ std::unique_ptr<table>
sort(table_view const& input); ``` ## `rmm::device_memory_resource` -libcudf Allocates all device memory via RMM memory resources (MR). See the +libcudf allocates all device memory via RMM memory resources (MR). See the [RMM documentation](https://github.com/rapidsai/rmm/blob/main/README.md) for details. ### Current Device Memory Resource RMM provides a "default" memory resource for each device that can be accessed and updated via the -`rmm::mr::get_current_device_resource()` and `rmm::mr::set_current_device_resource(...)` functions, -respectively. All memory resource parameters should be defaulted to use the return value of -`rmm::mr::get_current_device_resource()`. +`rmm::mr::get_current_device_resource()` and `rmm::mr::set_current_device_resource(...)` functions, +respectively. All memory resource parameters should be defaulted to use the return value of +`rmm::mr::get_current_device_resource()`. ## `cudf::column` -`cudf::column` is a core owning data structure in libcudf. Most libcudf public APIs produce either -a `cudf::column` or a `cudf::table` as output. A `column` contains `device_buffer`s which own the -device memory for the elements of a column and an optional null indicator bitmask. +`cudf::column` is a core owning data structure in libcudf. Most libcudf public APIs produce either +a `cudf::column` or a `cudf::table` as output. A `column` contains `device_buffer`s which own the +device memory for the elements of a column and an optional null indicator bitmask. -Implicitly convertible to `column_view` and `mutable_column_view`. +Implicitly convertible to `column_view` and `mutable_column_view`. -Movable and copyable. A copy performs a deep copy of the column's contents, whereas a move moves +Movable and copyable. A copy performs a deep copy of the column's contents, whereas a move moves the contents from one column to another. Example: @@ -214,13 +216,13 @@ A `column` may have nested (child) columns, depending on the data type of the co ### `cudf::column_view` -`cudf::column_view` is a core non-owning data structure in libcudf. It is an immutable, +`cudf::column_view` is a core non-owning data structure in libcudf. It is an immutable, non-owning view of device memory as a column. Most libcudf public APIs take views as inputs. -A `column_view` may be a view of a "slice" of a column. For example, it might view rows 75-150 of a -column with 1000 rows. The `size()` of this `column_view` would be `75`, and accessing index `0` of -the view would return the element at index `75` of the owning `column`. Internally, this is -implemented by storing in the view a pointer, an offset, and a size. `column_view::data<T>()` +A `column_view` may be a view of a "slice" of a column. For example, it might view rows 75-150 of a +column with 1000 rows. The `size()` of this `column_view` would be `75`, and accessing index `0` of +the view would return the element at index `75` of the owning `column`. Internally, this is +implemented by storing in the view a pointer, an offset, and a size. `column_view::data<T>()` returns a pointer iterator to `column_view::head<T>() + offset`. ### `cudf::mutable_column_view` A *mutable*, non-owning view of a column. Used in detail APIs that modify columns in place. ### `cudf::column_device_view` -An immutable, non-owning view of device data as a column of elements that is trivially copyable and -usable in CUDA device code.
Used to pass `column_view` data as input to CUDA kernels and device +An immutable, non-owning view of device data as a column of elements that is trivially copyable and +usable in CUDA device code. Used to pass `column_view` data as input to CUDA kernels and device functions (including Thrust algorithms) ### `cudf::mutable_column_device_view` -A mutable, non-owning view of device data as a column of elements that is trivially copyable and +A mutable, non-owning view of device data as a column of elements that is trivially copyable and usable in CUDA device code. Used to pass `column_view` data to be modified on the device by CUDA kernels and device functions (including Thrust algorithms). ## `cudf::table` -Owning class for a set of `cudf::column`s all with equal number of elements. This is the C++ -equivalent to a data frame. +Owning class for a set of `cudf::column`s all with equal number of elements. This is the C++ +equivalent to a data frame. Implicitly convertible to `cudf::table_view` and `cudf::mutable_table_view`. -Movable and copyable. A copy performs a deep copy of all columns, whereas a move moves all columns +Movable and copyable. A copy performs a deep copy of all columns, whereas a move moves all columns from one table to another. ### `cudf::table_view` -An *immutable*, non-owning view of a table. +An *immutable*, non-owning view of a table. ### `cudf::mutable_table_view` A *mutable*, non-owning view of a table. @@ -261,20 +263,20 @@ A *mutable*, non-owning view of a table. ## Spans libcudf provides `span` classes that mimic C++20 `std::span`, which is a lightweight -view of a contiguous sequence of objects. libcudf provides two classes, `host_span` and -`device_span`, which can be constructed from multiple container types, or from a pointer -(host or device, respectively) and size, or from iterators. `span` types are useful for defining +view of a contiguous sequence of objects. libcudf provides two classes, `host_span` and +`device_span`, which can be constructed from multiple container types, or from a pointer +(host or device, respectively) and size, or from iterators. `span` types are useful for defining generic (internal) interfaces which work with multiple input container types. `device_span` can be -constructed from `thrust::device_vector`, `rmm::device_vector`, or `rmm::device_uvector`. +constructed from `thrust::device_vector`, `rmm::device_vector`, or `rmm::device_uvector`. `host_span` can be constructed from `thrust::host_vector`, `std::vector`, or `std::basic_string`. -If you are definining internal (detail) functions that operate on vectors, use spans for the input +If you are defining internal (detail) functions that operate on vectors, use spans for the input vector parameters rather than a specific vector type, to make your functions more widely applicable. When a `span` refers to immutable elements, use `span<T const>`, not `span<T> const`. Since a span is a lightweight view, it does not propagate `const`-ness. Therefore, `const` should be applied to -the template type parameter, not to the `span` itself. Also, `span` should be passed by value -because it is a lightweight view. APIs in libcudf that take spans as input will look like the +the template type parameter, not to the `span` itself. Also, `span` should be passed by value +because it is a lightweight view. APIs in libcudf that take spans as input will look like the following function that copies device data to a host `std::vector`.
```c++ std::vector<T> make_std_vector_async(device_span<T const> v, rmm::cuda_stream_view stream) ``` ## `cudf::scalar` -A `cudf::scalar` is an object that can represent a singular, nullable value of any of the types -currently supported by cudf. Each type of value is represented by a separate type of scalar class -which are all derived from `cudf::scalar`. e.g. A `numeric_scalar` holds a single numerical value, +A `cudf::scalar` is an object that can represent a singular, nullable value of any of the types +currently supported by cudf. Each type of value is represented by a separate type of scalar class +which are all derived from `cudf::scalar`. e.g. A `numeric_scalar` holds a single numerical value, a `string_scalar` holds a single string. The data for the stored value resides in device memory. -A `list_scalar` holds the underlying data of a single list. This means the underlying data can be any type -that cudf supports. For example, a `list_scalar` representing a list of integers stores a `cudf::column` -of type `INT32`. A `list_scalar` representing a list of lists of integers stores a `cudf::column` of -type `LIST`, which in turn stores a column of type `INT32`. +A `list_scalar` holds the underlying data of a single list. This means the underlying data can be +any type that cudf supports. For example, a `list_scalar` representing a list of integers stores a +`cudf::column` of type `INT32`. A `list_scalar` representing a list of lists of integers stores a +`cudf::column` of type `LIST`, which in turn stores a column of type `INT32`.

|Value type|Scalar class|Notes|
|-|-|-|
|fixed-width|`fixed_width_scalar<T>`| `T` can be any fixed-width type|
|numeric|`numeric_scalar<T>`| `T` can be any numeric type |
|fixed-point|`fixed_point_scalar<T>`| `T` can be `numeric::decimal32` or `numeric::decimal64` |
|timestamp|`timestamp_scalar<T>`| `T` can be any timestamp type |
|duration|`duration_scalar<T>`| `T` can be any duration type |
|string|`string_scalar`| This class object contains device string data |
|list|`list_scalar`| Underlying data can be any type supported by cudf |

### Construction -`scalar`s can be created using either their respective constructors or using factory functions like -`make_numeric_scalar()`, `make_timestamp_scalar()` or `make_string_scalar()`. +`scalar`s can be created using either their respective constructors or using factory functions like +`make_numeric_scalar()`, `make_timestamp_scalar()` or `make_string_scalar()`. ### Casting -All the factory methods return a `unique_ptr<scalar>` which needs to be statically downcasted to -its respective scalar class type before accessing its value. Their validity (nullness) can be -accessed without casting. Generally, the value needs to be accessed from a function that is aware -of the value type e.g. a functor that is dispatched from `type_dispatcher`. To cast to the -requisite scalar class type given the value type, use the mapping utility `scalar_type_t` provided -in `type_dispatcher.hpp`: +All the factory methods return a `unique_ptr<scalar>` which needs to be statically downcasted to +its respective scalar class type before accessing its value. Their validity (nullness) can be +accessed without casting. Generally, the value needs to be accessed from a function that is aware +of the value type e.g. a functor that is dispatched from `type_dispatcher`. To cast to the +requisite scalar class type given the value type, use the mapping utility `scalar_type_t` provided +in `type_dispatcher.hpp`: ```c++ //unique_ptr<scalar> s = make_numeric_scalar(...); using ScalarType = cudf::scalar_type_t<int32_t>; //== numeric_scalar<int32_t> auto s1 = static_cast<ScalarType *>(s.get()); ``` ### Passing to device Each scalar type, except `list_scalar`, has a corresponding non-owning device view class which allows -access to the value and its validity from the device. This can be obtained using the function -`get_scalar_device_view(ScalarType s)`.
Note that a device view is not provided for a base scalar +access to the value and its validity from the device. This can be obtained using the function +`get_scalar_device_view(ScalarType s)`. Note that a device view is not provided for a base scalar object, only for the derived typed scalar class objects. The underlying data for `list_scalar` can be accessed via `view()` method. For non-nested data, @@ -339,17 +341,17 @@ data, a specialized device view for list columns can be constructed via ## Streams -CUDA streams are not yet exposed in external libcudf APIs. However, in order to ease the transition -to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be +CUDA streams are not yet exposed in external libcudf APIs. However, in order to ease the transition +to future use of streams, all libcudf APIs that allocate device memory or execute a kernel should be implemented using asynchronous APIs on the default stream (e.g., stream 0). -The recommended pattern for doing this is to make the definition of the external API invoke an -internal API in the `detail` namespace. The internal `detail` API has the same parameters as the -public API, plus a `rmm::cuda_stream_view` parameter at the end defaulted to -`rmm::cuda_stream_default`. The implementation should be wholly contained in the `detail` API +The recommended pattern for doing this is to make the definition of the external API invoke an +internal API in the `detail` namespace. The internal `detail` API has the same parameters as the +public API, plus a `rmm::cuda_stream_view` parameter at the end defaulted to +`rmm::cuda_stream_default`. The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the stream parameter. -In order to make the `detail` API callable from other libcudf functions, it should be exposed in a +In order to make the `detail` API callable from other libcudf functions, it should be exposed in a header placed in the `cudf/cpp/include/detail/` directory. For example: @@ -382,55 +384,57 @@ void external_function(...){ ``` **Note:** It is important to synchronize the stream if *and only if* it is necessary. For example, -when a non-pointer value is returned from the API that is the result of an asynchronous +when a non-pointer value is returned from the API that is the result of an asynchronous device-to-host copy, the stream used for the copy should be synchronized before returning. However, -when a column is returned, the stream should not be synchronized because doing so will break +when a column is returned, the stream should not be synchronized because doing so will break asynchrony if and when we add an asynchronous API to libcudf. **Note:** `cudaDeviceSynchronize()` should *never* be used. - This limits the ability to do any multi-stream/multi-threaded work with libcudf APIs. +This limits the ability to do any multi-stream/multi-threaded work with libcudf APIs. ### NVTX Ranges - In order to aid in performance optimization and debugging, all compute intensive libcudf functions should have a corresponding NVTX range. - In libcudf, we have a convenience macro `CUDF_FUNC_RANGE()` that will automatically annotate the lifetime of the enclosing function and use the functions name as the name of the NVTX range. - For more information about NVTX, see [here](https://github.com/NVIDIA/NVTX/tree/dev/cpp). 
+In order to aid in performance optimization and debugging, all compute intensive libcudf functions +should have a corresponding NVTX range. In libcudf, we have a convenience macro `CUDF_FUNC_RANGE()` +that will automatically annotate the lifetime of the enclosing function and use the function's name +as the name of the NVTX range. For more information about NVTX, see +[here](https://github.com/NVIDIA/NVTX/tree/dev/cpp). ### Stream Creation There may be times in implementing libcudf features where it would be advantageous to use streams *internally*, i.e., to accomplish overlap in implementing an algorithm. However, dynamically creating a stream can be expensive. RMM has a stream pool class to help avoid dynamic stream creation. However, this is not yet exposed in libcudf, so for the time being, libcudf features should avoid creating streams (even if it is slightly less efficient). It is a good idea to leave a `// TODO:` note indicating where using a stream would be beneficial. ## Memory Allocation Device [memory resources](#memory_resource) are used in libcudf to abstract and control how device memory is allocated. ### Output Memory Any libcudf API that allocates memory that is *returned* to a user must accept a pointer to a `device_memory_resource` as the last parameter. Inside the API, this memory resource must be used to allocate any memory for returned objects. It should therefore be passed into functions whose outputs will be returned. Example: ```c++ // Returned `column` contains newly allocated memory, // therefore the API must accept a memory resource pointer std::unique_ptr<column> returns_output_memory( ..., rmm::device_memory_resource * mr = rmm::mr::get_current_device_resource()); // This API does not allocate any new *output* memory, therefore // a memory resource is unnecessary void does_not_allocate_output_memory(...); ``` ### Temporary Memory Not all memory allocated within a libcudf API is returned to the caller. Often algorithms must allocate temporary, scratch memory for intermediate results. Always use the default resource obtained from `rmm::mr::get_current_device_resource()` for temporary memory allocations. Example: @@ -451,70 +455,70 @@ libcudf code generally eschews raw pointers and direct memory allocation. Use RMM classes built to use `device_memory_resource`(*)s for device memory allocation with automated lifetime management. #### `rmm::device_buffer` Allocates a specified number of bytes of untyped, uninitialized device memory using a -`device_memory_resource`.
If no resource is explicitly provided, uses +`rmm::mr::get_current_device_resource()`. -`rmm::device_buffer` is movable and copyable on a stream. A copy performs a deep copy of the -`device_buffer`'s device memory on the specified stream, whereas a move moves ownership of the +`rmm::device_buffer` is movable and copyable on a stream. A copy performs a deep copy of the +`device_buffer`'s device memory on the specified stream, whereas a move moves ownership of the device memory from one `device_buffer` to another. ```c++ -// Allocates at least 100 bytes of uninitialized device memory +// Allocates at least 100 bytes of uninitialized device memory // using the specified resource and stream -rmm::device_buffer buff(100, stream, mr); +rmm::device_buffer buff(100, stream, mr); void * raw_data = buff.data(); // Raw pointer to underlying device memory // Deep copies `buff` into `copy` on `stream` -rmm::device_buffer copy(buff, stream); +rmm::device_buffer copy(buff, stream); // Moves contents of `buff` into `moved_to` -rmm::device_buffer moved_to(std::move(buff)); +rmm::device_buffer moved_to(std::move(buff)); custom_memory_resource *mr...; // Allocates 100 bytes from the custom_memory_resource -rmm::device_buffer custom_buff(100, mr, stream); +rmm::device_buffer custom_buff(100, mr, stream); ``` #### `rmm::device_scalar` Allocates a single element of the specified type initialized to the specified value. Use this for -scalar input/outputs into device kernels, e.g., reduction results, null count, etc. This is +scalar input/outputs into device kernels, e.g., reduction results, null count, etc. This is effectively a convenience wrapper around a `rmm::device_vector` of length 1. ```c++ // Allocates device memory for a single int using the specified resource and stream // and initializes the value to 42 -rmm::device_scalar<int32_t> int_scalar{42, stream, mr}; +rmm::device_scalar<int32_t> int_scalar{42, stream, mr}; // scalar.data() returns pointer to value in device memory kernel<<<...>>>(int_scalar.data(),...); -// scalar.value() synchronizes the scalar's stream and copies the +// scalar.value() synchronizes the scalar's stream and copies the // value from device to host and returns the value int host_value = int_scalar.value(); ``` #### `rmm::device_vector` Allocates a specified number of elements of the specified type. If no initialization value is provided, all elements are default initialized (this incurs a kernel launch). **Note**: We have removed all usage of `rmm::device_vector` and `thrust::device_vector` from -libcudf, and you should not use it in new code in libcudf without careful consideration. Instead, -use `rmm::device_uvector` along with the utility factories in `device_factories.hpp`. These +libcudf, and you should not use it in new code in libcudf without careful consideration. Instead, -use `rmm::device_uvector` along with the utility factories in `device_factories.hpp`. These utilities enable creation of `uvector`s from host-side vectors, or creating zero-initialized `uvector`s, so that they are as convenient to use as `device_vector`. Avoiding `device_vector` has a number of benefits, as described in the following section on `rmm::device_uvector`.
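As a sketch of what such a factory-style helper does under the hood, here is an illustrative (not the actual library) function that copies a host vector into a `device_uvector`, assuming only the CUDA runtime:

```c++
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cuda_runtime_api.h>

#include <vector>

// Illustrative helper: copy a host std::vector into a new, initially
// uninitialized device_uvector on the given stream.
rmm::device_uvector<int32_t> to_device(std::vector<int32_t> const& host,
                                       rmm::cuda_stream_view stream)
{
  rmm::device_uvector<int32_t> result(host.size(), stream);
  cudaMemcpyAsync(result.data(), host.data(), host.size() * sizeof(int32_t),
                  cudaMemcpyHostToDevice, stream.value());
  return result;
}
```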
#### `rmm::device_uvector` Similar to a `device_vector`, allocates a contiguous set of elements in device memory but with key differences: - As an optimization, elements are uninitialized and no synchronization occurs at construction. This limits the types `T` to trivially copyable types. - All operations are stream ordered (i.e., they accept a `cuda_stream_view` specifying the stream on which the operation is performed). This improves safety when using non-default streams. - `device_uvector.hpp` does not include any `__device__` code, unlike `thrust/device_vector.hpp`, which means `device_uvector`s can be used in `.cpp` files, rather than just in `.cu` files. ```c++ cuda_stream s; // Allocates uninitialized storage for 100 `int32_t` elements on stream `s` using the // default resource rmm::device_uvector<int32_t> v(100, s); // Initializes the elements to 0 -thrust::uninitialized_fill(thrust::cuda::par.on(s.value()), v.begin(), v.end(), int32_t{0}); +thrust::uninitialized_fill(thrust::cuda::par.on(s.value()), v.begin(), v.end(), int32_t{0}); rmm::mr::device_memory_resource * mr = new my_custom_resource{...}; // Allocates uninitialized storage for 100 `int32_t` elements on stream `s` using the resource `mr` -rmm::device_uvector<int32_t> v2{100, s, mr}; +rmm::device_uvector<int32_t> v2{100, s, mr}; ``` ## Input/Output Style The preferred style for how inputs are passed in and outputs are returned is the following: -- Inputs - - Columns: - - `column_view const&` - - Tables: - - `table_view const&` +- Inputs + - Columns: + - `column_view const&` + - Tables: + - `table_view const&` - Scalar: - `scalar const&` - Everything else: - Trivial or inexpensively copied types - Pass by value - Non-trivial or expensive to copy types - Pass by `const&` -- In/Outs - - Columns: - - `mutable_column_view&` - - Tables: - - `mutable_table_view&` +- In/Outs + - Columns: + - `mutable_column_view&` + - Tables: + - `mutable_table_view&` - Everything else: - Pass via raw pointer -- Outputs - - Outputs should be *returned*, i.e., no output parameters - - Columns: - - `std::unique_ptr<column>` - - Tables: - - `std::unique_ptr<table>` +- Outputs + - Outputs should be *returned*, i.e., no output parameters + - Columns: + - `std::unique_ptr<column>` + - Tables: + - `std::unique_ptr<table>` - Scalars: - `std::unique_ptr<scalar>` ### Multiple Return Values Sometimes it is necessary for functions to have multiple outputs. There are a few ways this can be done in C++ (including creating a `struct` for the output). One convenient way to do this is using `std::tie` and `std::make_pair`. Note that objects passed to `std::make_pair` will invoke either the copy constructor or the move constructor of the object, and it may be preferable to move non-trivially copyable objects (and required for types with deleted copy constructors, like `std::unique_ptr`). ```c++ std::pair<cudf::table, cudf::table> return_two_tables(void){ cudf::table out0; cudf::table out1; ... // Do stuff with out0, out1 - + // Return a std::pair of the two outputs return std::make_pair(std::move(out0), std::move(out1)); } @@ -587,19 +591,20 @@ cudf::table out1; std::tie(out0, out1) = cudf::return_two_outputs(); ``` Note: `std::tuple` _could_ be used if not for the fact that Cython does not support -`std::tuple`. Therefore, libcudf APIs must use `std::pair`, and are therefore limited to return -only two objects of different types. Multiple objects of the same type may be returned via a +`std::tuple`. Therefore, libcudf APIs must use `std::pair`, and are therefore limited to return +only two objects of different types. Multiple objects of the same type may be returned via a `std::vector`. -Alternatively, with C++17 (supported from cudf v0.20), [structured binding](https://en.cppreference.com/w/cpp/language/structured_binding) +Alternatively, with C++17 (supported from cudf v0.20), +[structured binding](https://en.cppreference.com/w/cpp/language/structured_binding) may be used to disaggregate multiple return values: ```c++ auto [out0, out1] = cudf::return_two_outputs(); ``` -Note that the compiler might not support capturing aliases defined in a structured binding +Note that the compiler might not support capturing aliases defined in a structured binding in a lambda. One may work around this by using a capture with an initializer instead: ```c++ @@ -618,10 +623,10 @@ auto foo = [&out0 = out0] { ## Iterator-based interfaces Increasingly, libcudf is moving toward internal (`detail`) APIs with iterator parameters rather -than explicit `column`/`table`/`scalar` parameters. As with STL, iterators enable generic -algorithms to be applied to arbitrary containers. A good example of this is `cudf::copy_if_else`. -This function takes two inputs, and a Boolean mask. It copies the corresponding element from the +than explicit `column`/`table`/`scalar` parameters. As with STL, iterators enable generic +algorithms to be applied to arbitrary containers. A good example of this is `cudf::copy_if_else`. +This function takes two inputs, and a Boolean mask. It copies the corresponding element from the first or second input depending on whether the mask at that index is `true` or `false`.
Implementing `copy_if_else` for all combinations of `column` and `scalar` parameters is simplified by using iterators in the `detail` API. ```c++ template <typename LeftIter, typename RightIter, typename FilterFn> std::unique_ptr<column> copy_if_else( LeftIter lbegin, LeftIter lend, RightIter rbegin, FilterFn filter, ...); ``` -`LeftIter` and `RightIter` need only implement the necessary interface for an iterator. libcudf -provides a number of iterator types and utilities that are useful with iterator-based APIs from -libcudf as well as Thrust algorithms. Most are defined in `include/detail/iterator.cuh`. +`LeftIter` and `RightIter` need only implement the necessary interface for an iterator. libcudf +provides a number of iterator types and utilities that are useful with iterator-based APIs from +libcudf as well as Thrust algorithms. Most are defined in `include/detail/iterator.cuh`. ### Pair iterator -The pair iterator is used to access elements of nullable columns as a pair containing an element's -value and validity. `cudf::detail::make_pair_iterator` can be used to create a pair iterator from a -`column_device_view` or a `cudf::scalar`. `make_pair_iterator` is not available for +The pair iterator is used to access elements of nullable columns as a pair containing an element's +value and validity. `cudf::detail::make_pair_iterator` can be used to create a pair iterator from a +`column_device_view` or a `cudf::scalar`. `make_pair_iterator` is not available for `mutable_column_device_view`. ### Null-replacement iterator @@ -654,20 +659,20 @@ This iterator replaces the null/validity value for each element with a specified ### Validity iterator -This iterator returns the validity of the underlying element (`true` or `false`). Created using +This iterator returns the validity of the underlying element (`true` or `false`). Created using `cudf::detail::make_validity_iterator`. ### Index-normalizing iterators The proliferation of data types supported by libcudf can result in long compile times. One area where compile time was a problem is in types used to store indices, which can be any integer type. -The "Indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be -used for index types (integers) without requiring a type-specific instance. It can be used for any -iterator interface for reading an array of integer values of type `int8`, `int16`, `int32`, -`int64`, `uint8`, `uint16`, `uint32`, or `uint64`. Reading specific elements always returns a +The "Indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be +used for index types (integers) without requiring a type-specific instance. It can be used for any +iterator interface for reading an array of integer values of type `int8`, `int16`, `int32`, -`int64`, `uint8`, `uint16`, `uint32`, or `uint64`. Reading specific elements always returns a `cudf::size_type` integer. Use the `indexalator_factory` to create an appropriate input iterator from a column_view. Example input iterator usage: ```c++ @@ -699,20 +704,20 @@ namespace cudf{ } // namespace cudf ``` -The top-level `cudf` namespace is sufficient for most of the public API. However, to logically -group a broad set of functions, further namespaces may be used. For example, there are numerous -functions that are specific to columns of Strings. These functions reside in the `cudf::strings::` -namespace. Similarly, functionality used exclusively for unit testing is in the `cudf::test::` -namespace.
+The top-level `cudf` namespace is sufficient for most of the public API. However, to logically +group a broad set of functions, further namespaces may be used. For example, there are numerous +functions that are specific to columns of Strings. These functions reside in the `cudf::strings::` +namespace. Similarly, functionality used exclusively for unit testing is in the `cudf::test::` +namespace. ### Internal -Many functions are not meant for public use, so place them in either the `detail` or an *anonymous* +Many functions are not meant for public use, so place them in either the `detail` or an *anonymous* namespace, depending on the situation. #### `detail` namespace -Functions or objects that will be used across *multiple* translation units (i.e., source files), +Functions or objects that will be used across *multiple* translation units (i.e., source files), should be exposed in an internal header file and placed in the `detail` namespace. Example: ```c++ @@ -726,7 +731,7 @@ void reusable_helper_function(...); #### Anonymous namespace -Functions or objects that will only be used in a *single* translation unit should be defined in an +Functions or objects that will only be used in a *single* translation unit should be defined in an *anonymous* namespace in the source file where it is used. Example: ```c++ @@ -736,12 +741,12 @@ void isolated_helper_function(...); } // anonymous namespace ``` -[**Anonymous namespaces should *never* be used in a header file.**](https://wiki.sei.cmu.edu/confluence/display/cplusplus/DCL59-CPP.+Do+not+define+an+unnamed+namespace+in+a+header+file) +[**Anonymous namespaces should *never* be used in a header file.**](https://wiki.sei.cmu.edu/confluence/display/cplusplus/DCL59-CPP.+Do+not+define+an+unnamed+namespace+in+a+header+file) # Error Handling -libcudf follows conventions (and provides utilities) enforcing compile-time and run-time -conditions and detecting and handling CUDA errors. Communication of errors is always via C++ +libcudf follows conventions (and provides utilities) enforcing compile-time and run-time +conditions and detecting and handling CUDA errors. Communication of errors is always via C++ exceptions. ## Runtime Conditions @@ -753,13 +758,14 @@ Example usage: CUDF_EXPECTS(lhs.type() == rhs.type(), "Column type mismatch"); ``` -The first argument is the conditional expression expected to resolve to `true` under normal -conditions. If the conditional evaluates to `false`, then an error has occurred and an instance of `cudf::logic_error` is thrown. The second argument to `CUDF_EXPECTS` is a short description of the -error that has occurred and is used for the exception's `what()` message. +The first argument is the conditional expression expected to resolve to `true` under normal +conditions. If the conditional evaluates to `false`, then an error has occurred and an instance of +`cudf::logic_error` is thrown. The second argument to `CUDF_EXPECTS` is a short description of the +error that has occurred and is used for the exception's `what()` message. -There are times where a particular code path, if reached, should indicate an error no matter what. -For example, often the `default` case of a `switch` statement represents an invalid alternative. -Use the `CUDF_FAIL` macro for such errors. This is effectively the same as calling +There are times where a particular code path, if reached, should indicate an error no matter what. +For example, often the `default` case of a `switch` statement represents an invalid alternative. 
+Use the `CUDF_FAIL` macro for such errors. This is effectively the same as calling `CUDF_EXPECTS(false, reason)`. Example: ```c++ CUDF_FAIL("This code path should not be reached."); ``` ### CUDA Error Checking -Use the `CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This -macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The -thrown exception includes a description of the CUDA error code in it's `what()` message. +Use the `CUDA_TRY` macro to check for the successful completion of CUDA runtime API functions. This +macro throws a `cudf::cuda_error` exception if the CUDA API return value is not `cudaSuccess`. The +thrown exception includes a description of the CUDA error code in its `what()` message. Example: @@ -786,7 +792,7 @@ Use `static_assert` to enforce compile-time conditions. For example, ```c++ template <typename T> void trivial_types_only(T t){ - static_assert(std::is_trivial<T>::value, "This function requires a trivial type."); + static_assert(std::is_trivial<T>::value, "This function requires a trivial type."); ... } ``` # Data Types Columns may contain data of a number of types (see `enum class type_id` in `include/cudf/types.hpp`) * Lists of any type * Structs of columns of any type Most algorithms must support columns of any data type. This leads to complexity in the code, and is one of the primary challenges a libcudf developer faces. Sometimes we develop new algorithms with gradual support for more data types to make this easier. Typically we start with fixed-width data types such as numeric types and timestamps/durations, adding support for nested types later. Enabling an algorithm differently for different types uses either template specialization or SFINAE, as discussed in [Specializing Type-Dispatched Code Paths](#specializing-type-dispatched-code-paths). # Type Dispatcher libcudf stores data (for columns and scalars) "type erased" in `void*` device memory. This *type-erasure* enables interoperability with other languages and type systems, such as Python and Java. In order to determine the type, libcudf algorithms must use the run-time information stored in the column `type()` to reconstruct the data type `T` by casting the `void*` to the appropriate `T*`. This so-called *type dispatch* is pervasive throughout libcudf. The `type_dispatcher` is a central utility that automates the process of mapping the runtime type information in `data_type` to a concrete C++ type. At a high level, you call the `type_dispatcher` with a `data_type` and a function object (also known as a *functor*) with an `operator()` template.
Based on the value of `data_type::id()`, the +type dispatcher invokes the corresponding instantiation of the `operator()` template. -This simplified example shows how the value of `data_type::id()` determines which instantiation of +This simplified example shows how the value of `data_type::id()` determines which instantiation of the `F::operator()` template is invoked. ```c++ @@ -843,7 +849,7 @@ void type_dispatcher(data_type t, F f){ } ``` -The following example shows a function object called `size_of_functor` that returns the size of the +The following example shows a function object called `size_of_functor` that returns the size of the dispatched type. ```c++ @@ -857,9 +863,9 @@ cudf::type_dispatcher(data_type{type_id::INT32}, size_of_functor{}); // returns cudf::type_dispatcher(data_type{type_id::FLOAT64}, size_of_functor{}); // returns 8 ``` -By default, `type_dispatcher` uses `cudf::type_to_id` to provide the mapping of `cudf::type_id` -to dispatched C++ types. However, this mapping may be customized by explicitly specifying a -user-defined trait for the `IdTypeMap`. For example, to always dispatch `int32_t` for all values of +By default, `type_dispatcher` uses `cudf::type_to_id` to provide the mapping of `cudf::type_id` +to dispatched C++ types. However, this mapping may be customized by explicitly specifying a +user-defined trait for the `IdTypeMap`. For example, to always dispatch `int32_t` for all values of `cudf::type_id`: ```c++ @@ -871,18 +877,18 @@ cudf::type_dispatcher(data_type, f); ## Avoid Multiple Type Dispatch -Avoid multiple type-dispatch if possible. The compiler creates a code path for every type -dispatched, so a second-level type dispatch results in quadratic growth in compilation time and +Avoid multiple type-dispatch if possible. The compiler creates a code path for every type +dispatched, so a second-level type dispatch results in quadratic growth in compilation time and object code size. As a large library with many types and functions, we are constantly working to reduce compilation time and code size. ## Specializing Type-Dispatched Code Paths -It is often necessary to customize the dispatched `operator()` for different types. This can be +It is often necessary to customize the dispatched `operator()` for different types. This can be done in several ways. -The first method is to use explicit, full template specialization. This is useful for specializing -behavior for single types. The following example function object prints `"int32_t"` or `"double"` +The first method is to use explicit, full template specialization. This is useful for specializing +behavior for single types. The following example function object prints `"int32_t"` or `"double"` when invoked with either of those types, or `"unhandled type"` otherwise. ```c++ @@ -900,8 +906,8 @@ template <> void type_printer::operator()() { std::cout << "double\n"; } ``` -The second method is to use [SFINAE](https://en.cppreference.com/w/cpp/language/sfinae) with -`std::enable_if_t`. This is useful to partially specialize for a set of types with a common trait. +The second method is to use [SFINAE](https://en.cppreference.com/w/cpp/language/sfinae) with +`std::enable_if_t`. This is useful to partially specialize for a set of types with a common trait. The following example functor prints `integral` or `floating point` for integral or floating point types, respectively. @@ -909,7 +915,7 @@ types, respectively. 
struct integral_or_floating_point {
  template <typename T,
            std::enable_if_t<not std::is_integral<T>::value and
-                             not std::is_floating_point<T>::value>* = nullptr>
+                             not std::is_floating_point<T>::value>* = nullptr>
  void operator()() { std::cout << "neither integral nor floating point\n"; }
 
  template <typename T,
-            std::enable_if_t<std::is_floating_point<T>::value>* = nullptr>
+            std::enable_if_t<std::is_floating_point<T>::value>* = nullptr>
  void operator()() { std::cout << "floating point\n"; }
 };
 ```
 
 For more info on SFINAE with `std::enable_if`, [see this post](https://eli.thegreenplace.net/2014/sfinae-and-enable_if).
 
-There are a number of traits defined in `include/cudf/utilities/traits.hpp` that are useful for
-partial specialization of dispatched function objects. For example `is_numeric()` can be used to
+There are a number of traits defined in `include/cudf/utilities/traits.hpp` that are useful for
+partial specialization of dispatched function objects. For example `is_numeric()` can be used to
 specialize for any numeric type.
 
 # Variable-Size and Nested Data Types
 
-libcudf supports a number of variable-size and nested data types, including strings, lists, and
-structs.
-
- * `string`: Simply a character string, but a column of strings may have a different-length string
+libcudf supports a number of variable-size and nested data types, including strings, lists, and
+structs.
+
+ * `string`: Simply a character string, but a column of strings may have a different-length string
   in each row.
- * `list`: A list of elements of any type, so a column of lists of integers has rows with a list of
-   integers, possibly of a different length, in each row.
+ * `list`: A list of elements of any type, so a column of lists of integers has rows with a list of
+   integers, possibly of a different length, in each row.
 * `struct`: In a column of structs, each row is a structure comprising one or more fields. These
   fields are stored in structure-of-arrays format, so that the column of structs has a nested
-   column for each field of the structure.
+   column for each field of the structure.
 
-As the heading implies, list and struct columns may be nested arbitrarily. One may create a column
-of lists of structs, where the fields of the struct may be of any type, including strings, lists and
-structs. Thinking about deeply nested data types can be confusing for column-based data, even with
+As the heading implies, list and struct columns may be nested arbitrarily. One may create a column
+of lists of structs, where the fields of the struct may be of any type, including strings, lists and
+structs. Thinking about deeply nested data types can be confusing for column-based data, even with
 experience. Therefore it is important to carefully write algorithms, and to test and document them
 well.
 
@@ -952,13 +958,13 @@ well.
 
 In order to represent variable-width elements, libcudf columns contain a vector of child columns.
 For list columns, the parent column's type is `LIST` and contains no data, but its size represents
 the number of lists in the column, and its null mask represents the validity of each list element.
-The parent has two children.
+The parent has two children.
 
 1. A non-nullable column of `INT32` elements that indicates the offset to the beginning of each
    list in a dense column of elements.
-2. A column containing the actual data and optional null mask for all elements of all the lists
+2. A column containing the actual data and optional null mask for all elements of all the lists
   packed together.
-
+
 With this representation, `data[offsets[i]]` is the first element of list `i`, and the size of list
 `i` is given by `offsets[i+1] - offsets[i]`.
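As a self-contained illustration of that offsets arithmetic, here is a host-side sketch in plain standard C++; the two vectors below stand in for the offsets and data child columns, so this is illustrative rather than actual libcudf code:

```c++
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
  // Models the lists column [[1, 2], [], [3, 4, 5]].
  std::vector<int> data{1, 2, 3, 4, 5};  // all list elements packed together
  std::vector<int> offsets{0, 2, 2, 5};  // one more entry than the number of lists

  for (std::size_t i = 0; i + 1 < offsets.size(); ++i) {
    int const size = offsets[i + 1] - offsets[i];  // size of list i
    std::cout << "list " << i << " starts at data[" << offsets[i] << "] and has "
              << size << " elements\n";
  }
}
```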
@@ -967,9 +973,9 @@ of any type. Note also that not only is each list nullable (using the null mask
 each list element may be nullable. So you may have a lists column with null row 3, and also null
 element 2 of row 4.
 
-The underlying data for a lists column is always bundled into a single leaf column at the very
-bottom of the hierarchy (ignoring structs, which conceptually "reset" the root of the hierarchy),
-regardless of the level of nesting. So a `List<List<List<List<int>>>>>` column has a single `int`
+The underlying data for a lists column is always bundled into a single leaf column at the very
+bottom of the hierarchy (ignoring structs, which conceptually "reset" the root of the hierarchy),
+regardless of the level of nesting. So a `List<List<List<List<int>>>>` column has a single `int`
 column at the very bottom. The following is a visual representation of this.
 
 ```
@@ -997,17 +1003,17 @@ This is related to [Arrow's "Variable-Size List" memory layout](https://arrow.ap
 
 ## Strings columns
 
-Strings are represented in much the same way as lists, except that the data child column is always
+Strings are represented in much the same way as lists, except that the data child column is always
 a non-nullable column of `INT8` data. The parent column's type is `STRING` and contains no data,
-but its size represents the number of strings in the column, and its null mask represents the
+but its size represents the number of strings in the column, and its null mask represents the
 validity of each string. To summarize, the strings column children are:
 
-1. A non-nullable column of `INT32` elements that indicates the offset to the beginning of each
+1. A non-nullable column of `INT32` elements that indicates the offset to the beginning of each
    string in a dense column of all characters.
-2. A non-nullable column of `INT8` elements of all the characters across all the strings packed
+2. A non-nullable column of `INT8` elements of all the characters across all the strings packed
   together.
 
-With this representation, `characters[offsets[i]]` is the first character of string `i`, and the
+With this representation, `characters[offsets[i]]` is the first character of string `i`, and the
 size of string `i` is given by `offsets[i+1] - offsets[i]`.
 
 The following image shows an example of this compound column representation of strings.
@@ -1026,10 +1032,10 @@ null mask represents the validity of each struct element.
 
 With this representation, `child[0][10]` is row 10 of the first field of the struct, `child[1][42]`
 is row 42 of the second field of the struct.
 
-Notice that in addition to the struct column's null mask, each struct field column has its own optional null
-mask. A struct field's validity can vary independently from the corresponding struct row. For
-instance, a non-null struct row might have a null field. However, the fields of a null struct row
-are deemed to be null as well. For example, consider a struct column of type
+Notice that in addition to the struct column's null mask, each struct field column has its own
+optional null mask. A struct field's validity can vary independently from the corresponding struct
+row. For instance, a non-null struct row might have a null field. However, the fields of a null
+struct row are deemed to be null as well. For example, consider a struct column of type
 `STRUCT<FLOAT32, INT32>`. If the contents are `[ {1.0, 2}, {4.0, 5}, null, {8.0, null} ]`, the
 struct column's layout is as follows. (Note that null masks should be read from right to left.)
 
@@ -1039,46 +1045,46 @@ struct column's layout is as follows. (Note that null masks should be read from
   null_mask = [1, 1, 0, 1]
   null_count = 1
   children = {
-  {
+  {
     type = FLOAT32
     data = [1.0, 4.0, X, 8.0]
     null_mask = [ 1, 1, 0, 1]
     null_count = 1
-  },
-  {
+  },
+  {
     type = INT32
     data = [2, 5, X, X]
     null_mask = [1, 1, 0, 0]
     null_count = 2
-  }
-  }
+  }
+  }
 }
 ```
 
-The last struct row (index 3) is not null, but has a null value in the INT32 field. Also, row 2 of
-the struct column is null, making its corresponding fields also null. Therefore, bit 2 is unset in
+The last struct row (index 3) is not null, but has a null value in the INT32 field. Also, row 2 of
+the struct column is null, making its corresponding fields also null. Therefore, bit 2 is unset in
 the null masks of both struct fields.
 
 ## Dictionary columns
 
-Dictionaries provide an efficient way to represent low-cardinality data by storing a single copy
-of each value. A dictionary comprises a column of sorted keys and a column containing an index into
-the keys column for each row of the parent column. The keys column may have any libcudf data type,
-such as a numerical type or strings. The indices represent the corresponding positions of each
-element's value in the keys. The indices child column can have any unsigned integer type
+Dictionaries provide an efficient way to represent low-cardinality data by storing a single copy
+of each value. A dictionary comprises a column of sorted keys and a column containing an index into
+the keys column for each row of the parent column. The keys column may have any libcudf data type,
+such as a numerical type or strings. The indices represent the corresponding positions of each
+element's value in the keys. The indices child column can have any unsigned integer type
 (`UINT8`, `UINT16`, `UINT32`, or `UINT64`).
 
 ## Nested column challenges
 
-The first challenge with nested columns is that it is effectively impossible to do any operation
-that modifies the length of any string or list in place. For example, consider trying to append the
+The first challenge with nested columns is that it is effectively impossible to do any operation
+that modifies the length of any string or list in place. For example, consider trying to append the
 character `'a'` to the end of each string. This requires dynamically resizing the characters column
-to allow inserting `'a'` at the end of each string, and then modifying the offsets column to
+to allow inserting `'a'` at the end of each string, and then modifying the offsets column to
 indicate the new size of each element. As a result, every operation that can modify the strings or
 lists in a column must be done out-of-place.
 
 The second challenge is that in an out-of-place operation on a strings column, unlike with fixed-
-width elements, the size of the output cannot be known *a priori*. For example, consider scattering
+width elements, the size of the output cannot be known *a priori*. For example, consider scattering
 into a column of strings:
 
 ```c++
@@ -1090,7 +1096,7 @@ result:         {"this", "red", "a", "green", "of", "blue"}
 ```
 
 In this example, the strings "red", "green", and "blue" will respectively be scattered into
-positions `1`, `3`, and `5` of `destination`. Recall from above that this operation cannot be done
+positions `1`, `3`, and `5` of `destination`. Recall from above that this operation cannot be done
 in place, therefore `result` will be generated by selectively copying strings from `destination`
 and `scatter_values`. Notice that `result`'s child column of characters requires storage for `19`
 characters (4 + 3 + 1 + 5 + 2 + 4 characters for the six strings).
However, there is no way to know ahead of time that `result` will require `19` @@ -1102,9 +1108,9 @@ approach: 2. Allocate sufficient storage for all of the output characters and materialize each output string. In scatter, the first phase consists of using the `scatter_map` to determine whether string `i` in -the output will come from `destination` or from `scatter_values` and use the corresponding size(s) -to materialize the offsets column and determine the size of the output. Then, in the second phase, -sufficient storage is allocated for the output's characters, and then the characters are filled +the output will come from `destination` or from `scatter_values` and use the corresponding size(s) +to materialize the offsets column and determine the size of the output. Then, in the second phase, +sufficient storage is allocated for the output's characters, and then the characters are filled with the corresponding strings from either `destination` or `scatter_values`. ## Nested Type Views @@ -1113,15 +1119,15 @@ libcudf provides view types for nested column types as well as for the data elem ### `cudf::strings_column_view` and `cudf::string_view` -`cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of -any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore +`cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of +any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore `cudf::string_view` is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the -data type for a `cudf::column` of type `INT32`. As it's name implies, this is a read-only object +data type for a `cudf::column` of type `INT32`. As it's name implies, this is a read-only object instance that points to device memory inside the strings column. It's lifespan is the same (or less) as the column it views. Use the `column_device_view::element` method to access an individual row element. Like any other -column, do not call `element()` on a row that is null. +column, do not call `element()` on a row that is null. ```c++ cudf::column_device_view d_strings; @@ -1132,11 +1138,11 @@ column, do not call `element()` on a row that is null. } ``` -A null string is not the same as an empty string. Use the `string_scalar` class if you need an +A null string is not the same as an empty string. Use the `string_scalar` class if you need an instance of a class object to represent a null string. -The `string_view` contains comparison operators `<,>,==,<=,>=` that can be used in many cudf -functions like `sort` without string-specific code. The data for a `string_view` instance is +The `string_view` contains comparison operators `<,>,==,<=,>=` that can be used in many cudf +functions like `sort` without string-specific code. The data for a `string_view` instance is required to be [UTF-8](#UTF-8) and all operators and methods expect this encoding. Unless documented otherwise, position and length parameters are specified in characters and not bytes. The class also includes a `string_view::const_iterator` which can be used to navigate through individual characters @@ -1146,13 +1152,13 @@ within the string. #### UTF-8 -The libcudf strings column only supports UTF-8 encoding for strings data. -[UTF-8](https://en.wikipedia.org/wiki/UTF-8) is a variable-length character encoding wherein each +The libcudf strings column only supports UTF-8 encoding for strings data. 
+[UTF-8](https://en.wikipedia.org/wiki/UTF-8) is a variable-length character encoding wherein each character can be 1-4 bytes. This means the length of a string is not the same as its size in bytes. For this reason, it is recommended to use the `string_view` class to access these characters for most operations. -The `string_view.cuh` header also includes some utility methods for reading and writing +The `string_view.cuh` header also includes some utility methods for reading and writing (`to_char_utf8/from_char_utf8`) individual UTF-8 characters to/from byte arrays. ### `cudf::lists_column_view` and `cudf::lists_view` @@ -1171,7 +1177,7 @@ struct, and therefore `cudf::struct_view` is the data type of a `cudf::column` o # cuIO: file reading and writing -cuIO is a component of libcudf that provides GPU-accelerated reading and writing of data file +cuIO is a component of libcudf that provides GPU-accelerated reading and writing of data file formats commonly used in data analytics, including CSV, Parquet, ORC, Avro, and JSON_Lines. // TODO: add more detail and move to a separate file. diff --git a/cpp/docs/TESTING.md b/cpp/docs/TESTING.md index 1bdf9c208d8..3c1e992c7eb 100644 --- a/cpp/docs/TESTING.md +++ b/cpp/docs/TESTING.md @@ -1,68 +1,68 @@ # Unit Testing in libcudf -Unit tests in libcudf are written using +Unit tests in libcudf are written using [Google Test](https://github.com/google/googletest/blob/master/docs/primer.md). -**Important:** Instead of including `gtest/gtest.h` directly, use +**Important:** Instead of including `gtest/gtest.h` directly, use `#include `. ## Best Practices: What Should We Test? -In general we should test to make sure all code paths are covered. This is not always easy or +In general we should test to make sure all code paths are covered. This is not always easy or possible. But generally this means we test all supported combinations of algorithms and data types, -and all operators supported by algorithms that support multiple operators (e.g. reductions, +and all operators supported by algorithms that support multiple operators (e.g. reductions, groupby). Here are some other guidelines. * In general empty input is not an error in libcudf. Typically empty input results in empty output. Tests should verify this. - * Anything that involves manipulating bitmasks (especially hand-rolled kernels) should have tests + * Anything that involves manipulating bitmasks (especially hand-rolled kernels) should have tests that check varying number of rows, especially around boundaries like the warp size (32). So, test fewer than 32 rows, more than 32 rows, exactly 32 rows, and greater than 64 rows. - * Most algorithms should have one or more tests exercising inputs with a large enough number of - rows to require launching multiple thread blocks, especially when values are ultimately - communicated between blocks (e.g. reductions). This is especially important for custom kernels - but also applies to Thrust and CUB algorithm calls with lambdas / functors. + * Most algorithms should have one or more tests exercising inputs with a large enough number of + rows to require launching multiple thread blocks, especially when values are ultimately + communicated between blocks (e.g. reductions). This is especially important for custom kernels + but also applies to Thrust and CUB algorithm calls with lambdas / functors. * For anything involving strings or lists, test exhaustive combinations of empty strings/lists, - null strings/lists and strings/lists with null elements. 
-
+   null strings/lists and strings/lists with null elements.
+
+ * Strings tests should include a mixture of non-ASCII UTF-8 characters like `é` in test data.
 * Test sliced columns as input (that is, columns that have a nonzero `offset`). This is an easy to
   forget case.
- * Tests that verify various forms of "degenerate" column inputs, for example: empty
-   string columns that have no children (not many paths in cudf can generate these but it
-   does happen); columns with zero size but that somehow have non-null data pointers; and struct
+ * Tests that verify various forms of "degenerate" column inputs, for example: empty
+   string columns that have no children (not many paths in cudf can generate these but it
+   does happen); columns with zero size but that somehow have non-null data pointers; and struct
  columns with no children.
- * Decimal types are not included in the `NumericTypes` type list, but are included in
-   `FixedWidthTypes`, so be careful that tests either include or exclude decimal types as
+ * Decimal types are not included in the `NumericTypes` type list, but are included in
+   `FixedWidthTypes`, so be careful that tests either include or exclude decimal types as
  appropriate.
 
 ## Directory and File Naming
 
-The naming of unit test directories and source files should be consistent with the feature being
+The naming of unit test directories and source files should be consistent with the feature being
 tested. For example, the tests for APIs in `copying.hpp` should live in `cudf/cpp/tests/copying`.
-Each feature (or set of related features) should have its own test source file named
-`<feature>_tests.cu/cpp`. For example, `cudf/cpp/src/copying/scatter.cu` has tests in
+Each feature (or set of related features) should have its own test source file named
+`<feature>_tests.cu/cpp`. For example, `cudf/cpp/src/copying/scatter.cu` has tests in
 `cudf/cpp/tests/copying/scatter_tests.cu`.
 
-In the interest of improving compile time, whenever possible, test source files should be `.cpp`
+In the interest of improving compile time, whenever possible, test source files should be `.cpp`
 files because `nvcc` is slower than `gcc` in compiling host code. Note that `thrust::device_vector`
-includes device code, and so must only be used in `.cu` files. `rmm::device_uvector`,
-`rmm::device_buffer` and the various `column_wrapper` types described later can be used in `.cpp`
+includes device code, and so must only be used in `.cu` files. `rmm::device_uvector`,
+`rmm::device_buffer` and the various `column_wrapper` types described later can be used in `.cpp`
 files, and are therefore preferred in test code over `thrust::device_vector`.
 
 ## Base Fixture
 
 All libcudf unit tests should make use of a GTest ["Test Fixture"](https://github.com/google/googletest/blob/master/docs/primer.md#test-fixtures-using-the-same-data-configuration-for-multiple-tests-same-data-multiple-tests).
-Even if the fixture is empty, it should inherit from the base fixture `cudf::test::BaseFixture`
-found in `include/cudf_test/base_fixture.hpp`. This ensures that RMM is properly initialized and
-finalized. `cudf::test::BaseFixture` already inherits from `::testing::Test` and therefore it is
+Even if the fixture is empty, it should inherit from the base fixture `cudf::test::BaseFixture`
+found in `include/cudf_test/base_fixture.hpp`. This ensures that RMM is properly initialized and
+finalized. `cudf::test::BaseFixture` already inherits from `::testing::Test` and therefore it is
 not necessary for your test fixtures to inherit from it.
Example: @@ -74,7 +74,7 @@ class MyTestFixture : public cudf::test::BaseFixture {...}; In general, libcudf features must work across all of the supported types (there are exceptions e.g. not all binary operations are supported for all types). In order to automate the process of running -the same tests across multiple types, we use GTest's +the same tests across multiple types, we use GTest's [Typed Tests](https://github.com/google/googletest/blob/master/docs/advanced.md#typed-tests). Typed tests allow you to write a test once and run it across a list of types. @@ -92,15 +92,15 @@ TYPED_TEST(TypedTestFixture, FirstTest){ ``` To specify the list of types to use, instead of GTest's `::testing::Types<...>`, libcudf provides `cudf::test::Types<...>` which is a custom, drop-in replacement for `::testing::Types`. -In this example, all tests using the `TypedTestFixture` fixture will run once for each type in the +In this example, all tests using the `TypedTestFixture` fixture will run once for each type in the list defined in `TestTypes` (`int, float, double`). ### Type Lists -The list of types that are used in tests should be consistent across all tests. To ensure -consistency, several sets of common type lists are provided in +The list of types that are used in tests should be consistent across all tests. To ensure +consistency, several sets of common type lists are provided in `include/cudf_test/type_lists.hpp`. For example, `NumericTypes` is a type list of all numeric types, -`FixedWidthTypes` is a list of all fixed-width element types, and `AllTypes` is a list of every +`FixedWidthTypes` is a list of all fixed-width element types, and `AllTypes` is a list of every element type that libcudf supports. ```c++ @@ -110,17 +110,17 @@ element type that libcudf supports. TYPED_TEST_SUITE(TypedTestFixture, cudf::test::NumericTypes); ``` -Whenever possible, use one of the type list provided in `include/utilities/test/type_lists.hpp` +Whenever possible, use one of the type list provided in `include/utilities/test/type_lists.hpp` rather than creating new custom lists. #### Advanced Type Lists -Sometimes it is necessary to generate more advanced type lists than the simple lists of single types -in the `TypeList` example above. libcudf provides a set of meta-programming utilities in +Sometimes it is necessary to generate more advanced type lists than the simple lists of single types +in the `TypeList` example above. libcudf provides a set of meta-programming utilities in `include/cudf_test/type_list_utilities.hpp` for generating and composing more advanced type lists. For example, it may be useful to generate a *nested* type list where each element in the list is two -types. In a nested type list, each element in the list is itself another list. In order to access +types. In a nested type list, each element in the list is itself another list. In order to access the `N`th type within the nested list, use `GetType`. Imagine testing all possible two-type combinations of ``. This could be done manually: @@ -129,7 +129,7 @@ Imagine testing all possible two-type combinations of ``. This could using namespace cudf::test; template TwoTypesFixture : BaseFixture{...}; -using TwoTypesList = Types< Types, Types, +using TwoTypesList = Types< Types, Types, Types, Types >; TYPED_TEST_SUITE(TwoTypesFixture, TwoTypesList); TYPED_TEST(TwoTypesFixture, FirstTest){ @@ -140,49 +140,49 @@ TYPED_TEST(TwoTypesFixture, FirstTest){ } ``` -The above example manually specifies all pairs composed of `int` and `float`. 
`CrossProduct` is a
+utility in `type_list_utilities.hpp` which materializes this cross product automatically.
 
 ```c++
-using TwoTypesList = Types< Types<int, int>, Types<int, float>,
+using TwoTypesList = Types< Types<int, int>, Types<int, float>,
                            Types<float, int>, Types<float, float> >;
 using CrossProductTypeList = CrossProduct< Types<int, float>, Types<int, float> >;
 // TwoTypesList and CrossProductTypeList are identical
 ```
 
 `CrossProduct` can be used with an arbitrary number of type lists to generate nested type lists of
-two or more types. **However**, overuse of `CrossProduct` can dramatically inflate compile time.
-The cross product of two type lists of size `n` and `m` will result in a new list with
-`n*m` nested type lists. This means `n*m` templates will be instantiated; `n` and `m` need not be
+two or more types. **However**, overuse of `CrossProduct` can dramatically inflate compile time.
+The cross product of two type lists of size `n` and `m` will result in a new list with
+`n*m` nested type lists. This means `n*m` templates will be instantiated; `n` and `m` need not be
 large before compile time becomes unreasonable.
 
-There are a number of other utilities in `type_list_utilities.hpp`. For more details, see the
-documentation in that file and their associated tests in
+There are a number of other utilities in `type_list_utilities.hpp`. For more details, see the
+documentation in that file and their associated tests in
 `cudf/cpp/tests/utilities_tests/type_list_tests.cpp`.
 
 ## Utilities
 
 libcudf provides a number of utilities in `include/cudf_test` to make common testing operations more
-convenient. Before creating your own test utilities, look to see if one already exists that does
-what you need. If not, consider adding a new utility to do what you need. However, make sure that
-the utility is generic enough to be useful for other tests and is not overly tailored to your
+convenient. Before creating your own test utilities, look to see if one already exists that does
+what you need. If not, consider adding a new utility to do what you need. However, make sure that
+the utility is generic enough to be useful for other tests and is not overly tailored to your
 specific testing need.
 
 ### Column Wrappers
 
 In order to make generating input columns easier, libcudf provides the `*_column_wrapper` classes in
 `include/cudf_test/column_wrapper.hpp`. These classes wrap a `cudf::column` and provide constructors
-for initializing a `cudf::column` object usable with libcudf APIs. Any `*_column_wrapper` class is
-implicitly convertible to a `column_view` or `mutable_column_view` and therefore may be
+for initializing a `cudf::column` object usable with libcudf APIs. Any `*_column_wrapper` class is
+implicitly convertible to a `column_view` or `mutable_column_view` and therefore may be
 transparently passed to any API expecting a `column_view` or `mutable_column_view` argument.
 
 #### `fixed_width_column_wrapper`
 
 The `fixed_width_column_wrapper` class should be used for constructing and initializing columns of
-any fixed-width element type, e.g., numeric types, timestamp types, Boolean, etc.
+`fixed_width_column_wrapper` provides constructors that accept an iterator range to generate each +element in the column. For nullable columns, an additional iterator can be provided to indicate the +validity of each element. There are also constructors that accept a `std::initializer_list` for the column elements and optionally for the validity of each element. Example: @@ -207,9 +207,9 @@ fixed_width_column_wrapper w{ {1,2,3,4}, {1, 0, 1, 0}}; #### `fixed_point_column_wrapper` The `fixed_point_column_wrapper` class should be used for constructing and initializing columns of -any fixed-point element type (DECIMAL32 or DECIMAL64). `fixed_point_column_wrapper` provides -constructors that accept an iterator range to generate each element in the column. For nullable -columns, an additional iterator can be provided to indicate the validity of each element. +any fixed-point element type (DECIMAL32 or DECIMAL64). `fixed_point_column_wrapper` provides +constructors that accept an iterator range to generate each element in the column. For nullable +columns, an additional iterator can be provided to indicate the validity of each element. Constructors also take the scale of the fixed-point values to create. Example: @@ -226,10 +226,10 @@ fixed_point_column_wrapper w(elements, elements + 5, validity, 2); #### `dictionary_column_wrapper` -The `dictionary_column_wrapper` class should be used to create dictionary columns. -`dictionary_column_wrapper` provides constructors that accept an iterator range to generate each -element in the column. For nullable columns, an additional iterator can be provided to indicate the -validity of each element. There are also constructors that accept a `std::initializer_list` for +The `dictionary_column_wrapper` class should be used to create dictionary columns. +`dictionary_column_wrapper` provides constructors that accept an iterator range to generate each +element in the column. For nullable columns, an additional iterator can be provided to indicate the +validity of each element. There are also constructors that accept a `std::initializer_list` for the column elements and optionally for the validity of each element. Example: @@ -273,30 +273,30 @@ dictionary_column_wrapper d({"", "bb", "", "bb", "", "a", ""}, vali #### `strings_column_wrapper` -The `strings_column_wrapper` class should be used to create columns of strings. It provides -constructors that accept an iterator range to generate each string in the column. For nullable -columns, an additional iterator can be provided to indicate the validity of each string. There are -also constructors that accept a `std::initializer_list` for the column's strings and +The `strings_column_wrapper` class should be used to create columns of strings. It provides +constructors that accept an iterator range to generate each string in the column. For nullable +columns, an additional iterator can be provided to indicate the validity of each string. There are +also constructors that accept a `std::initializer_list` for the column's strings and optionally for the validity of each element. 
Example:
 
 ```c++
-// Creates a non-nullable STRING column with 7 string elements:
+// Creates a non-nullable STRING column with 7 string elements:
 // {"", "this", "is", "a", "column", "of", "strings"}
 std::vector<std::string> strings{"", "this", "is", "a", "column", "of", "strings"};
 strings_column_wrapper s(strings.begin(), strings.end());
 
-// Creates a nullable STRING column with 7 string elements:
+// Creates a nullable STRING column with 7 string elements:
 // {NULL, "this", NULL, "a", NULL, "of", NULL}
 std::vector<std::string> strings{"", "this", "is", "a", "column", "of", "strings"};
 auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;});
 strings_column_wrapper s(strings.begin(), strings.end(), validity);
 
-// Creates a non-nullable STRING column with 7 string elements:
+// Creates a non-nullable STRING column with 7 string elements:
 // {"", "this", "is", "a", "column", "of", "strings"}
 strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"});
 
-// Creates a nullable STRING column with 7 string elements:
+// Creates a nullable STRING column with 7 string elements:
 // {NULL, "this", NULL, "a", NULL, "of", NULL}
 auto validity = make_counting_transform_iterator(0, [](auto i){return i%2;});
 strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}, validity);
 ```
 
 #### `lists_column_wrapper`
 
-The `lists_column_wrapper` class should be used to create columns of lists. It provides
-constructors that accept an iterator range to generate each list in the column. For nullable
-columns, an additional iterator can be provided to indicate the validity of each list. There are
-also constructors that accept a `std::initializer_list` for the column's lists and
+The `lists_column_wrapper` class should be used to create columns of lists. It provides
+constructors that accept an iterator range to generate each list in the column. For nullable
+columns, an additional iterator can be provided to indicate the validity of each list. There are
+also constructors that accept a `std::initializer_list` for the column's lists and
 optionally for the validity of each element. A number of other constructors are available.
 
 Example:
 
 ```c++
@@ -357,9 +357,9 @@ lists_column_wrapper l{ {{{0, 1}, {2, 3}}, validity}, {{{4, 5}, {6, 7}}, validit
 
 #### `structs_column_wrapper`
 
-The `structs_column_wrapper` class should be used to create columns of structs. It provides
+The `structs_column_wrapper` class should be used to create columns of structs. It provides
 constructors that accept a vector or initializer list of pre-constructed columns or column wrappers
-for child columns. For nullable columns, an additional iterator can be provided to indicate the
+for child columns. For nullable columns, an additional iterator can be provided to indicate the
 validity of each struct.
 
 Examples:
 
 ```c++
@@ -413,29 +413,29 @@
 have the same metadata.
 
 #### `expect_column_properties_equal`
 
-Verifies that two columns have the same type, size, and nullability. For nested types, recursively
+Verifies that two columns have the same type, size, and nullability. For nested types, recursively
 verifies the equality of type, size and nullability of all nested children.
 
 #### `expect_column_properties_equivalent`
 
-Verifies that two columns have equivalent type and equal size, ignoring nullability. For nested
+Verifies that two columns have equivalent type and equal size, ignoring nullability.
For nested types, recursively verifies the equivalence of type, and equality of size of all nested children, ignoring nullability. Note "equivalent type". Most types are equivalent if and only they are equal. `fixed_point` types -are one exception. They are equivalent if the representation type is equal, even if they have -different scales. Nested type columns can be equivalent in the case where they both have zero size, -but one has children (also empty) and the other does not. For columns with nonzero size, both equals +are one exception. They are equivalent if the representation type is equal, even if they have +different scales. Nested type columns can be equivalent in the case where they both have zero size, +but one has children (also empty) and the other does not. For columns with nonzero size, both equals and equivalent expect equal number of children. #### `expect_columns_equal` -Verifies that two columns have equal properties and verifies elementwise equality of the column +Verifies that two columns have equal properties and verifies elementwise equality of the column data. Null elements are treated as equal. #### `expect_columns_equivalent` -Verifies that two columns have equivalent properties and verifies elementwise equivalence of the +Verifies that two columns have equivalent properties and verifies elementwise equivalence of the column data. Null elements are treated as equivalent. #### `expect_equal_buffers` @@ -444,6 +444,6 @@ Verifies the bitwise equality of two device memory buffers. ### Printing and accessing column data -`include/cudf_test/column_utilities.hpp` defines various functions and overloads for printing +`include/cudf_test/column_utilities.hpp` defines various functions and overloads for printing columns (`print`), converting column data to string (`to_string`, `to_strings`), and copying data to the host (`to_host). From 7fc65d80893a85c58252742e33cfee5e4dda179d Mon Sep 17 00:00:00 2001 From: Peixin Date: Tue, 16 Nov 2021 10:38:37 +0800 Subject: [PATCH 003/202] Update cudf JNI to 22.02.0-SNAPSHOT (#9681) Signed-off-by: Peixin Li cudfjni version update. NOTE: this includes change to use gpuci/cuda images since official cuda images is not ready yet on docker hub Authors: - Peixin (https://github.com/pxLi) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9681 --- java/ci/Dockerfile.centos7 | 8 +++++--- java/ci/README.md | 10 +++++----- java/pom.xml | 2 +- java/src/main/native/CMakeLists.txt | 2 +- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/java/ci/Dockerfile.centos7 b/java/ci/Dockerfile.centos7 index a6264a84696..2ee57bfaeab 100644 --- a/java/ci/Dockerfile.centos7 +++ b/java/ci/Dockerfile.centos7 @@ -17,11 +17,13 @@ ### # Build the image for cudf development environment. # -# Arguments: CUDA_VERSION=11.0, 11.1, 11.2.0 or 11.2.2 +# Arguments: CUDA_VERSION=11.5.0 # ### -ARG CUDA_VERSION -FROM nvidia/cuda:$CUDA_VERSION-devel-centos7 +ARG CUDA_VERSION=11.5.0 +# use rapids gpuci/cuda images until nvidia/cuda cuda 11.5+ images are available in docker hub +# FROM nvidia/cuda:$CUDA_VERSION-devel-centos7 +FROM gpuci/cuda:$CUDA_VERSION-devel-centos7 ### Install basic requirements RUN yum install -y centos-release-scl diff --git a/java/ci/README.md b/java/ci/README.md index 5432dc8d0f1..0e947b62511 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -11,14 +11,14 @@ In the root path of cuDF repo, run below command to build the docker image. 
```bash -docker build -f java/ci/Dockerfile.centos7 --build-arg CUDA_VERSION=11.2.2 -t cudf-build:11.2.2-devel-centos7 . +docker build -f java/ci/Dockerfile.centos7 --build-arg CUDA_VERSION=11.5.0 -t cudf-build:11.5.0-devel-centos7 . ``` The following CUDA versions are supported w/ CUDA Enhanced Compatibility: * CUDA 11.0+ Change the --build-arg CUDA_VERSION to what you need. -You can replace the tag "cudf-build:11.2.2-devel-centos7" with another name you like. +You can replace the tag "cudf-build:11.5.0-devel-centos7" with another name you like. ## Start the docker then build @@ -26,7 +26,7 @@ You can replace the tag "cudf-build:11.2.2-devel-centos7" with another name you Run below command to start a docker container with GPU. ```bash -nvidia-docker run -it cudf-build:11.2.2-devel-centos7 bash +nvidia-docker run -it cudf-build:11.5.0-devel-centos7 bash ``` ### Download the cuDF source code @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.2.2-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-21.12 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-22.02 ``` ### Build cuDF jar with devtoolset @@ -47,5 +47,5 @@ scl enable devtoolset-9 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-21.12.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-22.02.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index 356d94455c8..87d43ec1272 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 21.12.0-SNAPSHOT + 22.02.0-SNAPSHOT cudfjni diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index d9fc3f337e7..a5a6646c7e6 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -26,7 +26,7 @@ rapids_cuda_init_architectures(CUDF_JNI) project( CUDF_JNI - VERSION 21.12.00 + VERSION 22.02.00 LANGUAGES C CXX CUDA ) From 7e4a985444148d727a1be457e745eff7fecc75fc Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 15 Nov 2021 20:51:11 -0800 Subject: [PATCH 004/202] Some improvements to `parse_decimal` function and bindings for `is_fixed_point` (#9658) This PR adds Java bindings for `is_fixed_point` Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Nghia Truong (https://github.com/ttnghia) - Robert (Bobby) Evans (https://github.com/revans2) - David Wendt (https://github.com/davidwendt) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9658 --- .../strings/convert/convert_fixed_point.hpp | 8 ++-- .../main/java/ai/rapids/cudf/ColumnView.java | 32 ++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 16 ++++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 38 +++++++++---------- 4 files changed, 69 insertions(+), 25 deletions(-) diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp index 7bcb7e72ab2..5fe5c880f9d 100644 --- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp +++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp @@ -93,18 +93,16 @@ std::unique_ptr from_fixed_point( * @brief Returns a boolean column identifying strings in which all * characters are valid for conversion to fixed-point. 
* - * The output row entry is set to `true` if the corresponding string element - * has at least one character in [+-0123456789.]. The optional sign character - * must only be in the first position. The decimal point may only appear once. + * The sign and the exponent is optional. The decimal point may only appear once. * Also, the integer component must fit within the size limits of the * underlying fixed-point storage type. The value of the integer component * is based on the scale of the `decimal_type` provided. * * @code{.pseudo} * Example: - * s = ['123', '-456', '', '1.2.3', '+17E30', '12.34' '.789', '-0.005] + * s = ['123', '-456', '', '1.2.3', '+17E30', '12.34', '.789', '-0.005] * b = is_fixed_point(s) - * b is [true, true, false, false, false, true, true, true] + * b is [true, true, false, false, true, true, true, true] * @endcode * * Any null entries result in corresponding null entries in the output column. diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index aa9d3f0d9f3..329c251f72d 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -348,6 +348,34 @@ public final ColumnVector isNull() { return new ColumnVector(isNullNative(getNativeView())); } + /** + * Returns a Boolean vector with the same number of rows as this instance, that has + * TRUE for any entry that is a fixed-point, and FALSE if its not a fixed-point. + * A null will be returned for null entries. + * + * The sign and the exponent is optional. The decimal point may only appear once. + * The integer component must fit within the size limits of the underlying fixed-point + * storage type. The value of the integer component is based on the scale of the target + * decimalType. + * + * Example: + * vec = ["A", "nan", "Inf", "-Inf", "Infinity", "infinity", "2.1474", "112.383", "-2.14748", + * "NULL", "null", null, "1.2", "1.2e-4", "0.00012"] + * vec.isFixedPoint() = [false, false, false, false, false, false, true, true, true, false, false, + * null, true, true, true] + * + * @param decimalType the data type that should be used for bounds checking. Note that only + * Decimal types (fixed-point) are allowed. + * @return Boolean vector + */ + public final ColumnVector isFixedPoint(DType decimalType) { + assert type.equals(DType.STRING); + assert decimalType.isDecimalType(); + return new ColumnVector(isFixedPoint(getNativeView(), + decimalType.getTypeId().getNativeId(), decimalType.getScale())); + } + + /** * Returns a Boolean vector with the same number of rows as this instance, that has * TRUE for any entry that is an integer, and FALSE if its not an integer. A null will be returned @@ -375,6 +403,7 @@ public final ColumnVector isInteger() { */ public final ColumnVector isInteger(DType intType) { assert type.equals(DType.STRING); + assert intType.isBackedByInt() || intType.isBackedByLong(); return new ColumnVector(isIntegerWithType(getNativeView(), intType.getTypeId().getNativeId(), intType.getScale())); } @@ -3220,6 +3249,9 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { */ private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format); + + private static native long isFixedPoint(long viewHandle, int nativeTypeId, int scale); + /** * Native method to concatenate a list column of strings (each row is a list of strings), * concatenates the strings within each row and returns a single strings column result. 
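For reference, a minimal C++ sketch of exercising the `cudf::strings::is_fixed_point` API documented above, which these Java bindings ultimately wrap (the wrapper function name and the scale value are illustrative assumptions, and a linked libcudf build is assumed):

```c++
#include <cudf/column/column.hpp>
#include <cudf/strings/convert/convert_fixed_point.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Returns a BOOL8 column that is true for each row convertible to a
// DECIMAL32 value with scale -3; null input rows produce null outputs.
std::unique_ptr<cudf::column> check_decimal32(cudf::strings_column_view const& input)
{
  return cudf::strings::is_fixed_point(input,
                                       cudf::data_type{cudf::type_id::DECIMAL32, -3});
}
```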
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index d2a2030e24c..bce330ea4a3 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -2023,6 +2023,22 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isInteger(JNIEnv *env, jo CATCH_STD(env, 0) } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isFixedPoint(JNIEnv *env, jobject, + jlong handle, jint j_dtype, + jint scale) { + + JNI_NULL_CHECK(env, handle, "native view handle is null", 0) + + try { + cudf::jni::auto_set_device(env); + cudf::column_view *view = reinterpret_cast(handle); + cudf::data_type fp_dtype = cudf::jni::make_data_type(j_dtype, scale); + std::unique_ptr result = cudf::strings::is_fixed_point(*view, fp_dtype); + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0) +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isIntegerWithType(JNIEnv *env, jobject, jlong handle, jint j_dtype, jint scale) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 0d007aa0ed7..4d52862f7b0 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -18,12 +18,7 @@ package ai.rapids.cudf; -import ai.rapids.cudf.HostColumnVector.BasicType; -import ai.rapids.cudf.HostColumnVector.DataType; -import ai.rapids.cudf.HostColumnVector.ListType; -import ai.rapids.cudf.HostColumnVector.StructData; -import ai.rapids.cudf.HostColumnVector.StructType; - +import ai.rapids.cudf.HostColumnVector.*; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @@ -38,20 +33,9 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; -import static ai.rapids.cudf.QuantileMethod.HIGHER; -import static ai.rapids.cudf.QuantileMethod.LINEAR; -import static ai.rapids.cudf.QuantileMethod.LOWER; -import static ai.rapids.cudf.QuantileMethod.MIDPOINT; -import static ai.rapids.cudf.QuantileMethod.NEAREST; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; -import static ai.rapids.cudf.TableTest.assertStructColumnsAreEqual; -import static ai.rapids.cudf.TableTest.assertTablesAreEqual; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotEquals; -import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertThrows; -import static org.junit.jupiter.api.Assertions.assertTrue; +import static ai.rapids.cudf.QuantileMethod.*; +import static ai.rapids.cudf.TableTest.*; +import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assumptions.assumeTrue; public class ColumnVectorTest extends CudfTestBase { @@ -4834,6 +4818,20 @@ void testIsInteger() { } } + @Test + void testIsFixedPoint() { + String[] decimalStrings = {"A", "nan", "Inf", "-Inf", "Infinity", "infinity", + "2.1474", "112.383", "-2.14748", "NULL", "null", null, "1.2", "1.2e-4", "0.00012"}; + + DType dt = DType.create(DType.DTypeEnum.DECIMAL32, -3); + try (ColumnVector decStringCV = ColumnVector.fromStrings(decimalStrings); + ColumnVector isFixedPoint = decStringCV.isFixedPoint(dt); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, false, false, false + , false, true, true, true, false, false, null, true, true, true)) { + assertColumnsAreEqual(expected, 
isFixedPoint); + } + } + @Test void testIsFloat() { String[] floatStrings = {"A", "nan", "Inf", "-Inf", "Infinity", "infinity", "-0.0", "0.0", From c3bcc8d6d223a999b5beba3c60ad8af8d86844a0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 16 Nov 2021 10:36:25 -0600 Subject: [PATCH 005/202] Fix `null` handling when `boolean` dtype is passed (#9691) Fixes: #9642 This PR fixes issue where null values being treated as `False` when `boolean` dtype was being passed to the `Series` constructor. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9691 --- python/cudf/cudf/core/column/column.py | 5 +++++ python/cudf/cudf/tests/test_series.py | 11 +++++++++++ 2 files changed, 16 insertions(+) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5f9104263b1..6f2f01c746d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2060,6 +2060,11 @@ def as_column( return cudf.core.column.Decimal32Column.from_arrow( data ) + if is_bool_dtype(dtype): + # Need this special case handling for bool dtypes, + # since 'boolean' & 'pd.BooleanDtype' are not + # understood by np.dtype below. + dtype = "bool" np_type = np.dtype(dtype).type pa_type = np_to_pa_dtype(np.dtype(dtype)) data = as_column( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 09f0417b7ac..73fe46746ce 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1335,3 +1335,14 @@ def test_equals_names(lhs, rhs): expect = lhs.to_pandas().equals(rhs.to_pandas()) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", [[True, False, None, True, False], [None, None], []] +) +@pytest.mark.parametrize("bool_dtype", ["bool", "boolean", pd.BooleanDtype()]) +def test_nullable_bool_dtype_series(data, bool_dtype): + psr = pd.Series(data, dtype=pd.BooleanDtype()) + gsr = cudf.Series(data, dtype=bool_dtype) + + assert_eq(psr, gsr.to_pandas(nullable=True)) From e08ae9cb15fe260015cf70a22181fa67123e779f Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Tue, 16 Nov 2021 18:03:14 -0800 Subject: [PATCH 006/202] Implement Series.datetime.floor (#9571) Fixes: #7102 Replaces: [#9488](https://github.com/rapidsai/cudf/pull/9488/files) Authors: - Sheilah Kirui (https://github.com/skirui-source) - Mayank Anand (https://github.com/mayankanand007) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Michael Wang (https://github.com/isVoid) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9571 --- cpp/include/cudf/datetime.hpp | 93 ++++++++++- cpp/src/datetime/datetime_ops.cu | 199 ++++++++++++++++++----- cpp/tests/datetime/datetime_ops_test.cpp | 92 ++++++++++- docs/cudf/source/api_docs/series.rst | 2 + python/cudf/cudf/_lib/cpp/datetime.pxd | 17 +- python/cudf/cudf/_lib/datetime.pyx | 33 +++- python/cudf/cudf/core/column/datetime.py | 7 +- python/cudf/cudf/core/series.py | 73 ++++++++- python/cudf/cudf/tests/test_datetime.py | 39 ++++- 9 files changed, 502 insertions(+), 53 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index d67984daa7c..71e5968bf07 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -378,5 +378,96 @@ std::unique_ptr<cudf::column> ceil_nanosecond(
   column_view const& column,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Round down to the nearest day
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> floor_day(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round down to the nearest hour
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> floor_hour(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round down to the nearest minute
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> floor_minute(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round down to the nearest second
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> floor_second(
+  cudf::column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round down to the nearest millisecond
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ *
+ * @throw cudf::logic_error if input column datatype is not TIMESTAMP
+ * @return cudf::column of the same datetime resolution as the input column
+ */
+std::unique_ptr<cudf::column> floor_millisecond(
+  column_view const& column,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Round down to the nearest microsecond
+ *
+ * @param column cudf::column_view of the input datetime values
+ * @param mr Device memory resource used to allocate device memory of the returned column.
+ * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_microsecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Round down to the nearest nanosecond + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column. + * + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + * @return cudf::column of the same datetime resolution as the input column + */ +std::unique_ptr floor_nanosecond( + column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace datetime } // namespace cudf diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ccfad56b4ea..717bd7ac0a8 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -54,6 +54,8 @@ enum class datetime_component { NANOSECOND }; +enum class rounding_kind { CEIL, FLOOR }; + template struct extract_component_operator { template @@ -88,32 +90,59 @@ struct extract_component_operator { } }; -template -struct ceil_timestamp { +// This functor takes the rounding type as runtime info and dispatches to the ceil/floor/round +// function. +template +struct RoundFunctor { + template + CUDA_DEVICE_CALLABLE auto operator()(rounding_kind round_kind, Timestamp dt) + { + switch (round_kind) { + case rounding_kind::CEIL: return cuda::std::chrono::ceil(dt); + case rounding_kind::FLOOR: return cuda::std::chrono::floor(dt); + default: cudf_assert(false && "Unsupported rounding kind."); + } + __builtin_unreachable(); + } +}; + +struct RoundingDispatcher { + rounding_kind round_kind; + datetime_component component; + + RoundingDispatcher(rounding_kind round_kind, datetime_component component) + : round_kind(round_kind), component(component) + { + } + template CUDA_DEVICE_CALLABLE Timestamp operator()(Timestamp const ts) const { - using namespace cuda::std::chrono; - // want to use this with D, H, T (minute), S, L (millisecond), U - switch (COMPONENT) { + switch (component) { case datetime_component::DAY: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::HOUR: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::MINUTE: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::SECOND: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::MILLISECOND: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::MICROSECOND: - return time_point_cast(ceil(ts)); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); case datetime_component::NANOSECOND: - return time_point_cast(ceil(ts)); - default: cudf_assert(false && "Unexpected resolution"); + return time_point_cast( + RoundFunctor{}(round_kind, ts)); + default: cudf_assert(false && "Unsupported datetime rounding resolution."); } - - return {}; + __builtin_unreachable(); } }; @@ -196,10 +225,11 @@ struct is_leap_year_op { }; // Specific function for applying ceil/floor date ops -template -struct dispatch_ceil { +struct dispatch_round { template 
std::enable_if_t(), std::unique_ptr> operator()( + rounding_kind round_kind, + datetime_component component, cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const @@ -221,7 +251,7 @@ struct dispatch_ceil { column.begin(), column.end(), output->mutable_view().begin(), - TransformFunctor{}); + RoundingDispatcher{round_kind, component}); return output; } @@ -384,13 +414,14 @@ std::unique_ptr add_calendrical_months(column_view const& timestamp_colu } } -template -std::unique_ptr ceil_general(column_view const& column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr round_general(rounding_kind round_kind, + datetime_component component, + column_view const& column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return cudf::type_dispatcher( - column.type(), dispatch_ceil>{}, column, stream, mr); + column.type(), dispatch_round{}, round_kind, component, column, stream, mr); } std::unique_ptr extract_year(column_view const& column, @@ -498,53 +529,147 @@ std::unique_ptr extract_quarter(column_view const& column, std::unique_ptr ceil_day(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::DAY, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_hour(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::HOUR, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_minute(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MINUTE, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_second(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::SECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_millisecond(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MILLISECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_microsecond(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::MICROSECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr ceil_nanosecond(column_view const& column, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::ceil_general( - column, rmm::cuda_stream_default, mr); + return detail::round_general(detail::rounding_kind::CEIL, + detail::datetime_component::NANOSECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_day(column_view const& column, 
rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::DAY, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_hour(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::HOUR, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_minute(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MINUTE, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_second(column_view const& column, rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::SECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_millisecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MILLISECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_microsecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::MICROSECOND, + column, + rmm::cuda_stream_default, + mr); +} + +std::unique_ptr floor_nanosecond(column_view const& column, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::round_general(detail::rounding_kind::FLOOR, + detail::datetime_component::NANOSECOND, + column, + rmm::cuda_stream_default, + mr); } std::unique_ptr extract_year(column_view const& column, rmm::mr::device_memory_resource* mr) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index c0d2d1cc447..b70ac29fd5d 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -357,9 +357,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestCeilDatetime) using namespace cuda::std::chrono; auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); auto host_val = to_host(input); thrust::host_vector timestamps = host_val.first; @@ -403,6 +403,22 @@ TYPED_TEST(TypedDatetimeOpsTest, TestCeilDatetime) auto expected_millisecond = fixed_width_column_wrapper( ceiled_millisecond.begin(), ceiled_millisecond.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_millisecond(input), expected_millisecond); + + std::vector ceiled_microsecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_microsecond.begin(), [](auto i) { + return time_point_cast(ceil(i)); + }); + auto expected_microsecond = fixed_width_column_wrapper( + ceiled_microsecond.begin(), ceiled_microsecond.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_microsecond(input), expected_microsecond); + + std::vector ceiled_nanosecond(timestamps.size()); + std::transform(timestamps.begin(), timestamps.end(), ceiled_nanosecond.begin(), 
[](auto i) {
+    return time_point_cast(ceil(i));
+  });
+  auto expected_nanosecond = fixed_width_column_wrapper(
+    ceiled_nanosecond.begin(), ceiled_nanosecond.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ceil_nanosecond(input), expected_nanosecond);
 }
 
 TEST_F(BasicDatetimeOpsTest, TestDayOfYearWithDate)
@@ -827,4 +843,76 @@ TEST_F(BasicDatetimeOpsTest, TestQuarter)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_quarter(timestamps_s), quarter);
 }
 
+TYPED_TEST(TypedDatetimeOpsTest, TestFloorDatetime)
+{
+  using T = TypeParam;
+  using namespace cudf::test;
+  using namespace cudf::datetime;
+  using namespace cuda::std::chrono;
+
+  auto start = milliseconds(-2500000000000);  // Sat, 11 Oct 1890 19:33:20 GMT
+  auto stop  = milliseconds(2500000000000);   // Mon, 22 Mar 2049 04:26:40 GMT
+
+  auto input = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop));
+
+  auto host_val = to_host(input);
+  thrust::host_vector timestamps = host_val.first;
+
+  std::vector floored_day(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_day.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_day = fixed_width_column_wrapper(floored_day.begin(),
+                                                 floored_day.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_day(input), expected_day);
+
+  std::vector floored_hour(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_hour.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_hour = fixed_width_column_wrapper(
+    floored_hour.begin(), floored_hour.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_hour(input), expected_hour);
+
+  std::vector floored_minute(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_minute.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_minute = fixed_width_column_wrapper(
+    floored_minute.begin(), floored_minute.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_minute(input), expected_minute);
+
+  std::vector floored_second(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_second.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_second = fixed_width_column_wrapper(
+    floored_second.begin(), floored_second.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_second(input), expected_second);
+
+  std::vector floored_millisecond(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_millisecond.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_millisecond = fixed_width_column_wrapper(
+    floored_millisecond.begin(), floored_millisecond.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_millisecond(input), expected_millisecond);
+
+  std::vector floored_microsecond(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_microsecond.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_microsecond = fixed_width_column_wrapper(
+    floored_microsecond.begin(), floored_microsecond.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_microsecond(input), expected_microsecond);
+
+  std::vector floored_nanosecond(timestamps.size());
+  std::transform(timestamps.begin(), timestamps.end(), floored_nanosecond.begin(), [](auto i) {
+    return time_point_cast(floor(i));
+  });
+  auto expected_nanosecond = fixed_width_column_wrapper(
+    floored_nanosecond.begin(), floored_nanosecond.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*floor_nanosecond(input), expected_nanosecond);
+}
+
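+// Illustrative sketch only, not part of the tests above: the behavior these
+// tests pin down, assuming a TIMESTAMP_SECONDS column named `input` holding
+// 2001-01-01 00:04:45.
+//
+//   auto floored = cudf::datetime::floor_minute(input);  // 2001-01-01 00:04:00
+//   auto ceiled  = cudf::datetime::ceil_minute(input);   // 2001-01-01 00:05:00
+//
+// floor_* never yields a value later than its input, ceil_* never yields one
+// earlier, and both return a column of the same timestamp resolution as the
+// input.
+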
CUDF_TEST_PROGRAM_MAIN() diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 46a31a0dcf6..b90ee628332 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -300,6 +300,8 @@ Datetime methods strftime isocalendar + ceil + floor Timedelta properties diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index 2af4dd648c5..38ed9fbd769 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -23,7 +23,22 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] ceil_microsecond( const column_view& column ) except + - cdef unique_ptr[column] ceil_nanosecond(const column_view& column) except + + cdef unique_ptr[column] ceil_nanosecond( + const column_view& column + ) except + + cdef unique_ptr[column] floor_day(const column_view& column) except + + cdef unique_ptr[column] floor_hour(const column_view& column) except + + cdef unique_ptr[column] floor_minute(const column_view& column) except + + cdef unique_ptr[column] floor_second(const column_view& column) except + + cdef unique_ptr[column] floor_millisecond( + const column_view& column + ) except + + cdef unique_ptr[column] floor_microsecond( + const column_view& column + ) except + + cdef unique_ptr[column] floor_nanosecond( + const column_view& column + ) except + cdef unique_ptr[column] add_calendrical_months( const column_view& timestamps, const column_view& months diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 5cda06362b6..3215088c438 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -72,13 +72,13 @@ def ceil_datetime(Column col, object field): c_result = move(libcudf_datetime.ceil_day(col_view)) elif field == "H": c_result = move(libcudf_datetime.ceil_hour(col_view)) - elif field == "T": + elif field == "T" or field == "min": c_result = move(libcudf_datetime.ceil_minute(col_view)) elif field == "S": c_result = move(libcudf_datetime.ceil_second(col_view)) - elif field == "L": + elif field == "L" or field == "ms": c_result = move(libcudf_datetime.ceil_millisecond(col_view)) - elif field == "U": + elif field == "U" or field == "us": c_result = move(libcudf_datetime.ceil_microsecond(col_view)) elif field == "N": c_result = move(libcudf_datetime.ceil_nanosecond(col_view)) @@ -89,6 +89,33 @@ def ceil_datetime(Column col, object field): return result +def floor_datetime(Column col, object field): + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + + with nogil: + # https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.resolution_string.html + if field == "D": + c_result = move(libcudf_datetime.floor_day(col_view)) + elif field == "H": + c_result = move(libcudf_datetime.floor_hour(col_view)) + elif field == "T" or field == "min": + c_result = move(libcudf_datetime.floor_minute(col_view)) + elif field == "S": + c_result = move(libcudf_datetime.floor_second(col_view)) + elif field == "L" or field == "ms": + c_result = move(libcudf_datetime.floor_millisecond(col_view)) + elif field == "U" or field == "us": + c_result = move(libcudf_datetime.floor_microsecond(col_view)) + elif field == "N": + c_result = move(libcudf_datetime.floor_nanosecond(col_view)) + else: + raise ValueError(f"Invalid resolution: '{field}'") + + result = Column.from_unique_ptr(move(c_result)) + return result + + def is_leap_year(Column col): """Returns a 
boolean indicator whether the year of the date is a leap year
     """
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 68379002e6b..756e48edccb 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -222,8 +222,11 @@ def values(self):
     def get_dt_field(self, field: str) -> ColumnBase:
         return libcudf.datetime.extract_datetime_component(self, field)
 
-    def ceil(self, field: str) -> ColumnBase:
-        return libcudf.datetime.ceil_datetime(self, field)
+    def ceil(self, freq: str) -> ColumnBase:
+        return libcudf.datetime.ceil_datetime(self, freq)
+
+    def floor(self, freq: str) -> ColumnBase:
+        return libcudf.datetime.floor_datetime(self, freq)
 
     def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike:
         if isinstance(other, cudf.Scalar):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 00a8ebabe34..c804f2bca2c 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4592,11 +4592,76 @@ def _get_dt_field(self, field):
             data=out_column, index=self.series._index, name=self.series.name
         )
 
-    def ceil(self, field):
-        out_column = self.series._column.ceil(field)
+    def ceil(self, freq):
+        """
+        Perform ceil operation on the data to the specified freq.
 
-        return Series(
-            data=out_column, index=self.series._index, name=self.series.name
+        Parameters
+        ----------
+        freq : str
+            One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"].
+            Must be a fixed frequency like 'S' (second) not 'ME' (month end).
+            See `frequency aliases `__
+            for more details on these aliases.
+
+        Returns
+        -------
+        Series
+            Series with all timestamps rounded up to the specified frequency.
+            The index is preserved.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> t = cudf.Series(["2001-01-01 00:04:45", "2001-01-01 00:04:58",
+        ...     "2001-01-01 00:05:04"], dtype="datetime64[ns]")
+        >>> t.dt.ceil("T")
+        0   2001-01-01 00:05:00
+        1   2001-01-01 00:05:00
+        2   2001-01-01 00:06:00
+        dtype: datetime64[ns]
+        """
+        out_column = self.series._column.ceil(freq)
+
+        return Series._from_data(
+            data={self.series.name: out_column}, index=self.series._index
+        )
+
+    def floor(self, freq):
+        """
+        Perform floor operation on the data to the specified freq.
+
+        Parameters
+        ----------
+        freq : str
+            One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"].
+            Must be a fixed frequency like 'S' (second) not 'ME' (month end).
+            See `frequency aliases `__
+            for more details on these aliases.
+
+        Returns
+        -------
+        Series
+            Series with all timestamps rounded down to the specified frequency.
+            The index is preserved.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> t = cudf.Series(["2001-01-01 00:04:45", "2001-01-01 00:04:58",
+        ...
"2001-01-01 00:05:04"], dtype="datetime64[ns]") + >>> t.dt.floor("T") + 0 2001-01-01 00:04:00 + 1 2001-01-01 00:04:00 + 2 2001-01-01 00:05:00 + dtype: datetime64[ns] + """ + out_column = self.series._column.floor(freq) + + return Series._from_data( + data={self.series.name: out_column}, index=self.series._index ) def strftime(self, date_format, *args, **kwargs): diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index d666dfc0ec1..bf75badc06f 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1777,12 +1777,45 @@ def test_error_values(): ], ) @pytest.mark.parametrize("time_type", DATETIME_TYPES) -@pytest.mark.parametrize("resolution", ["D", "H", "T", "S", "L", "U", "N"]) +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) def test_ceil(data, time_type, resolution): - ps = pd.Series(data, dtype=time_type) - gs = cudf.from_pandas(ps) + gs = cudf.Series(data, dtype=time_type) + ps = gs.to_pandas() expect = ps.dt.ceil(resolution) got = gs.dt.ceil(resolution) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + ( + [ + "2020-05-31 08:00:00", + "1999-12-31 18:40:10", + "2000-12-31 04:00:05", + "1900-02-28 07:00:06", + "1800-03-14 07:30:20", + "2100-03-14 07:30:20", + "1970-01-01 00:00:09", + "1969-12-31 12:59:10", + ] + ) + ], +) +@pytest.mark.parametrize("time_type", DATETIME_TYPES) +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) +def test_floor(data, time_type, resolution): + + gs = cudf.Series(data, dtype=time_type) + ps = gs.to_pandas() + + expect = ps.dt.floor(resolution) + got = gs.dt.floor(resolution) + assert_eq(expect, got) From 4d13d81bb04a51a1ad7f476184c2b1eb88038126 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Tue, 16 Nov 2021 21:11:28 -0800 Subject: [PATCH 007/202] Fixed build by adding more checks for int8, int16 (#9707) Add additional checks for int8, int16 fixes [#/rapidsai/cudf/4127](https://github.com/NVIDIA/spark-rapids/issues/4127) Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/9707 --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 329c251f72d..729444f460c 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -398,12 +398,13 @@ public final ColumnVector isInteger() { * for null entries. * * @param intType the data type that should be used for bounds checking. Note that only - * integer types are allowed. 
+   *                cudf integer types are allowed, including signed/unsigned int8 through int64
    * @return Boolean vector
    */
   public final ColumnVector isInteger(DType intType) {
     assert type.equals(DType.STRING);
-    assert intType.isBackedByInt() || intType.isBackedByLong();
+    assert intType.isBackedByInt() || intType.isBackedByLong() || intType.isBackedByByte()
+        || intType.isBackedByShort();
     return new ColumnVector(isIntegerWithType(getNativeView(),
         intType.getTypeId().getNativeId(), intType.getScale()));
   }

From 91141042ac5ce5024975eb2eab63f916047e6b6a Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Wed, 17 Nov 2021 10:31:10 -0800
Subject: [PATCH 008/202] Add parameters to control row group size in Parquet
 writer (#9677)

Closes https://github.com/rapidsai/cudf/issues/9615

Adds the following API to the Parquet writer:

- Set maximum row group size, in bytes (minimum of 512KB);
- Set maximum row group size, in rows (minimum of 5000).

The API is more limited than its ORC equivalent because of limitations in Parquet page size control/estimation.

Other changes:

- Fix naming in some ORC APIs to be consistent.
- Change `rowgroup` to `row_group` in APIs, since Parquet specs refer to this as "row group", not "rowgroup".
- Replace some `uint32_t` use in Parquet writer.
- Remove unused `target_page_size`.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/9677
---
 cpp/include/cudf/io/detail/parquet.hpp   |   2 +-
 cpp/include/cudf/io/orc.hpp              |  20 ++--
 cpp/include/cudf/io/parquet.hpp          | 125 ++++++++++++++++++++-
 cpp/src/io/functions.cpp                 |   6 +-
 cpp/src/io/orc/writer_impl.cu            |  12 +--
 cpp/src/io/parquet/writer_impl.cu        |  79 +++++++------
 cpp/src/io/parquet/writer_impl.hpp       |  12 +--
 cpp/tests/io/parquet_test.cpp            |  22 ++++
 python/cudf/cudf/_lib/cpp/io/orc.pxd     |   8 +-
 python/cudf/cudf/_lib/cpp/io/parquet.pxd |  22 +++-
 python/cudf/cudf/_lib/parquet.pyx        |  34 +++---
 python/cudf/cudf/io/parquet.py           |   6 ++
 python/cudf/cudf/tests/test_parquet.py   |  23 +++++
 python/cudf/cudf/utils/ioutils.py        |  10 +-
 14 files changed, 291 insertions(+), 90 deletions(-)

diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index 14f27ef8eef..98922ad10a4 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -148,7 +148,7 @@ class writer {
    * @param[in] metadata_list List of input file metadata
    * @return A parquet-compatible blob that contains the data for all rowgroups in the list
    */
-  static std::unique_ptr> merge_rowgroup_metadata(
+  static std::unique_ptr> merge_row_group_metadata(
     const std::vector>>& metadata_list);
 };

diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index fb1199fc166..3bc2e6c9ef2 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -475,24 +475,24 @@ class orc_writer_options {
   /**
    * @brief Whether writing column statistics is enabled/disabled.
    */
-  bool enable_statistics() const { return _enable_statistics; }
+  bool is_enabled_statistics() const { return _enable_statistics; }
 
   /**
    * @brief Returns maximum stripe size, in bytes.
   */
-  auto stripe_size_bytes() const { return _stripe_size_bytes; }
+  auto get_stripe_size_bytes() const { return _stripe_size_bytes; }
 
   /**
   * @brief Returns maximum stripe size, in rows.
*/ - auto stripe_size_rows() const { return _stripe_size_rows; } + auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. */ - auto row_index_stride() const + auto get_row_index_stride() const { - auto const unaligned_stride = std::min(_row_index_stride, stripe_size_rows()); + auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows()); return unaligned_stride - unaligned_stride % 8; } @@ -769,24 +769,24 @@ class chunked_orc_writer_options { /** * @brief Whether writing column statistics is enabled/disabled. */ - bool enable_statistics() const { return _enable_statistics; } + bool is_enabled_statistics() const { return _enable_statistics; } /** * @brief Returns maximum stripe size, in bytes. */ - auto stripe_size_bytes() const { return _stripe_size_bytes; } + auto get_stripe_size_bytes() const { return _stripe_size_bytes; } /** * @brief Returns maximum stripe size, in rows. */ - auto stripe_size_rows() const { return _stripe_size_rows; } + auto get_stripe_size_rows() const { return _stripe_size_rows; } /** * @brief Returns the row index stride. */ - auto row_index_stride() const + auto get_row_index_stride() const { - auto const unaligned_stride = std::min(_row_index_stride, stripe_size_rows()); + auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows()); return unaligned_stride - unaligned_stride % 8; } diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 660ec051304..88cf7416506 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -37,6 +37,9 @@ namespace io { * @file */ +constexpr size_t default_row_group_size_bytes = 128 * 1024 * 1024; // 128MB +constexpr size_type default_row_group_size_rows = 1000000; + /** * @brief Builds parquet_reader_options to use for `read_parquet()`. */ @@ -398,6 +401,10 @@ class parquet_writer_options { bool _write_timestamps_as_int96 = false; // Column chunks file path to be set in the raw output metadata std::string _column_chunks_file_path; + // Maximum size of each row group (unless smaller than a single page) + size_t _row_group_size_bytes = default_row_group_size_bytes; + // Maximum number of rows in row group (unless smaller than a single page) + size_type _row_group_size_rows = default_row_group_size_rows; /** * @brief Constructor from sink and table. @@ -472,6 +479,16 @@ class parquet_writer_options { */ std::string get_column_chunks_file_path() const { return _column_chunks_file_path; } + /** + * @brief Returns maximum row group size, in bytes. + */ + auto get_row_group_size_bytes() const { return _row_group_size_bytes; } + + /** + * @brief Returns maximum row group size, in rows. + */ + auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** * @brief Sets metadata. * @@ -510,6 +527,28 @@ class parquet_writer_options { { _column_chunks_file_path.assign(file_path); } + + /** + * @brief Sets the maximum row group size, in bytes. + */ + void set_row_group_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS( + size_bytes >= 512 * 1024, + "The maximum row group size cannot be smaller than the page size, which is 512KB."); + _row_group_size_bytes = size_bytes; + } + + /** + * @brief Sets the maximum row group size, in rows. 
+ */ + void set_row_group_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum row group size cannot be smaller than the page size, which is 5000 rows."); + _row_group_size_rows = size_rows; + } }; class parquet_writer_options_builder { @@ -582,6 +621,30 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Sets the maximum row group size, in bytes. + * + * @param val maximum row group size + * @return this for chaining. + */ + parquet_writer_options_builder& row_group_size_bytes(size_t val) + { + options.set_row_group_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum number of rows in output row groups. + * + * @param val maximum number or rows + * @return this for chaining. + */ + parquet_writer_options_builder& row_group_size_rows(size_type val) + { + options.set_row_group_size_rows(val); + return *this; + } + /** * @brief Sets whether int96 timestamps are written or not in parquet_writer_options. * @@ -637,7 +700,7 @@ std::unique_ptr> write_parquet( * @param[in] metadata_list List of input file metadata. * @return A parquet-compatible blob that contains the data for all row groups in the list. */ -std::unique_ptr> merge_rowgroup_metadata( +std::unique_ptr> merge_row_group_metadata( const std::vector>>& metadata_list); /** @@ -660,6 +723,10 @@ class chunked_parquet_writer_options { // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; + // Maximum size of each row group (unless smaller than a single page) + size_t _row_group_size_bytes = default_row_group_size_bytes; + // Maximum number of rows in row group (unless smaller than a single page) + size_type _row_group_size_rows = default_row_group_size_rows; /** * @brief Constructor from sink. @@ -703,6 +770,16 @@ class chunked_parquet_writer_options { */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } + /** + * @brief Returns maximum row group size, in bytes. + */ + auto get_row_group_size_bytes() const { return _row_group_size_bytes; } + + /** + * @brief Returns maximum row group size, in rows. + */ + auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** * @brief Sets metadata. * @@ -732,6 +809,28 @@ class chunked_parquet_writer_options { */ void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; } + /** + * @brief Sets the maximum row group size, in bytes. + */ + void set_row_group_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS( + size_bytes >= 512 * 1024, + "The maximum row group size cannot be smaller than the page size, which is 512KB."); + _row_group_size_bytes = size_bytes; + } + + /** + * @brief Sets the maximum row group size, in rows. + */ + void set_row_group_size_rows(size_type size_rows) + { + CUDF_EXPECTS( + size_rows >= 5000, + "The maximum row group size cannot be smaller than the page size, which is 5000 rows."); + _row_group_size_rows = size_rows; + } + /** * @brief creates builder to build chunked_parquet_writer_options. * @@ -811,6 +910,30 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Sets the maximum row group size, in bytes. + * + * @param val maximum row group size + * @return this for chaining. 
+ */ + chunked_parquet_writer_options_builder& row_group_size_bytes(size_t val) + { + options.set_row_group_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum number of rows in output row groups. + * + * @param val maximum number or rows + * @return this for chaining. + */ + chunked_parquet_writer_options_builder& row_group_size_rows(size_type val) + { + options.set_row_group_size_rows(val); + return *this; + } + /** * @brief move chunked_parquet_writer_options member once it's built. */ diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index b678941db21..a8ca1d3a459 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -405,13 +405,13 @@ table_with_metadata read_parquet(parquet_reader_options const& options, } /** - * @copydoc cudf::io::merge_rowgroup_metadata + * @copydoc cudf::io::merge_row_group_metadata */ -std::unique_ptr> merge_rowgroup_metadata( +std::unique_ptr> merge_row_group_metadata( const std::vector>>& metadata_list) { CUDF_FUNC_RANGE(); - return detail_parquet::writer::merge_rowgroup_metadata(metadata_list); + return detail_parquet::writer::merge_row_group_metadata(metadata_list); } table_input_metadata::table_input_metadata(table_view const& table, diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 2bf020d08a2..1563e3e1fd7 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1301,10 +1301,10 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), - max_stripe_size{options.stripe_size_bytes(), options.stripe_size_rows()}, - row_index_stride{options.row_index_stride()}, + max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, + row_index_stride{options.get_row_index_stride()}, compression_kind_(to_orc_compression(options.get_compression())), - enable_statistics_(options.enable_statistics()), + enable_statistics_(options.is_enabled_statistics()), single_write_mode(mode == SingleWriteMode::YES), out_sink_(std::move(sink)) { @@ -1321,10 +1321,10 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), - max_stripe_size{options.stripe_size_bytes(), options.stripe_size_rows()}, - row_index_stride{options.row_index_stride()}, + max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, + row_index_stride{options.get_row_index_stride()}, compression_kind_(to_orc_compression(options.get_compression())), - enable_statistics_(options.enable_statistics()), + enable_statistics_(options.is_enabled_statistics()), single_write_mode(mode == SingleWriteMode::YES), out_sink_(std::move(sink)) { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 2c7d745bb4c..62803432157 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1026,6 +1026,8 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + max_row_group_size{options.get_row_group_size_bytes()}, + max_row_group_rows{options.get_row_group_size_rows()}, compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), @@ -1045,6 +1047,8 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + max_row_group_size{options.get_row_group_size_bytes()}, + 
max_row_group_rows{options.get_row_group_size_rows()}, compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), @@ -1148,8 +1152,7 @@ void writer::impl::write(table_view const& table) // compression/decompression performance). using cudf::io::parquet::gpu::max_page_fragment_size; - uint32_t num_fragments = - (uint32_t)((num_rows + max_page_fragment_size - 1) / max_page_fragment_size); + size_type const num_fragments = (num_rows + max_page_fragment_size - 1) / max_page_fragment_size; cudf::detail::hostdevice_2dvector fragments( num_columns, num_fragments, stream); @@ -1162,21 +1165,20 @@ void writer::impl::write(table_view const& table) init_page_fragments(fragments, col_desc, num_rows, max_page_fragment_size); } - size_t global_rowgroup_base = md.row_groups.size(); + auto const global_rowgroup_base = static_cast(md.row_groups.size()); // Decide row group boundaries based on uncompressed data size - size_t rowgroup_size = 0; - uint32_t num_rowgroups = 0; - for (uint32_t f = 0, global_r = global_rowgroup_base, rowgroup_start = 0; f < num_fragments; - f++) { - size_t fragment_data_size = 0; + auto rowgroup_size = 0ul; + auto num_rowgroups = 0; + for (auto f = 0, global_r = global_rowgroup_base, rowgroup_start = 0; f < num_fragments; f++) { + auto fragment_data_size = 0ul; // Replace with STL algorithm to transform and sum for (auto i = 0; i < num_columns; i++) { fragment_data_size += fragments[i][f].fragment_data_size; } if (f > rowgroup_start && - (rowgroup_size + fragment_data_size > max_rowgroup_size_ || - (f + 1 - rowgroup_start) * max_page_fragment_size > max_rowgroup_rows_)) { + (rowgroup_size + fragment_data_size > max_row_group_size || + (f + 1 - rowgroup_start) * max_page_fragment_size > max_row_group_rows)) { // update schema md.row_groups.resize(md.row_groups.size() + 1); md.row_groups[global_r++].num_rows = (f - rowgroup_start) * max_page_fragment_size; @@ -1204,15 +1206,15 @@ void writer::impl::write(table_view const& table) } } // Initialize row groups and column chunks - uint32_t num_chunks = num_rowgroups * num_columns; + auto const num_chunks = num_rowgroups * num_columns; hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); - for (uint32_t r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; + for (auto r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; r++, global_r++) { - uint32_t fragments_in_chunk = (uint32_t)( - (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size); + size_type const fragments_in_chunk = + (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size; md.row_groups[global_r].total_byte_size = 0; md.row_groups[global_r].columns.resize(num_columns); - for (int i = 0; i < num_columns; i++) { + for (auto i = 0; i < num_columns; i++) { gpu::EncColumnChunk* ck = &chunks[r][i]; *ck = {}; @@ -1244,8 +1246,8 @@ void writer::impl::write(table_view const& table) } auto dict_info_owner = build_chunk_dictionaries(chunks, col_desc, num_rows, stream); - for (uint32_t rg = 0, global_rg = global_rowgroup_base; rg < num_rowgroups; rg++, global_rg++) { - for (int col = 0; col < num_columns; col++) { + for (auto rg = 0, global_rg = global_rowgroup_base; rg < num_rowgroups; rg++, global_rg++) { + for (auto col = 0; col < num_columns; col++) { if (chunks.host_view()[rg][col].use_dictionary) { 
md.row_groups[global_rg].columns[col].meta_data.encodings.push_back( Encoding::PLAIN_DICTIONARY); @@ -1274,16 +1276,16 @@ void writer::impl::write(table_view const& table) } // Initialize batches of rowgroups to encode (mainly to limit peak memory usage) - std::vector batch_list; - uint32_t num_pages = 0; - size_t max_bytes_in_batch = 1024 * 1024 * 1024; // 1GB - TBD: Tune this - size_t max_uncomp_bfr_size = 0; - size_t max_comp_bfr_size = 0; - size_t max_chunk_bfr_size = 0; - uint32_t max_pages_in_batch = 0; - size_t bytes_in_batch = 0; - size_t comp_bytes_in_batch = 0; - for (uint32_t r = 0, groups_in_batch = 0, pages_in_batch = 0; r <= num_rowgroups; r++) { + std::vector batch_list; + size_type num_pages = 0; + size_t max_bytes_in_batch = 1024 * 1024 * 1024; // 1GB - TODO: Tune this + size_t max_uncomp_bfr_size = 0; + size_t max_comp_bfr_size = 0; + size_t max_chunk_bfr_size = 0; + size_type max_pages_in_batch = 0; + size_t bytes_in_batch = 0; + size_t comp_bytes_in_batch = 0; + for (size_type r = 0, groups_in_batch = 0, pages_in_batch = 0; r <= num_rowgroups; r++) { size_t rowgroup_size = 0; size_t comp_rowgroup_size = 0; if (r < num_rowgroups) { @@ -1331,11 +1333,11 @@ void writer::impl::write(table_view const& table) // This contains stats for both the pages and the rowgroups. TODO: make them separate. rmm::device_uvector page_stats(num_stats_bfr, stream); - for (uint32_t b = 0, r = 0; b < (uint32_t)batch_list.size(); b++) { - uint8_t* bfr = static_cast(uncomp_bfr.data()); - uint8_t* bfr_c = static_cast(comp_bfr.data()); - for (uint32_t j = 0; j < batch_list[b]; j++, r++) { - for (int i = 0; i < num_columns; i++) { + for (auto b = 0, r = 0; b < static_cast(batch_list.size()); b++) { + auto bfr = static_cast(uncomp_bfr.data()); + auto bfr_c = static_cast(comp_bfr.data()); + for (auto j = 0; j < batch_list[b]; j++, r++) { + for (auto i = 0; i < num_columns; i++) { gpu::EncColumnChunk* ck = &chunks[r][i]; ck->uncompressed_bfr = bfr; ck->compressed_bfr = bfr_c; @@ -1360,14 +1362,15 @@ void writer::impl::write(table_view const& table) pinned_buffer host_bfr{nullptr, cudaFreeHost}; // Encode row groups in batches - for (uint32_t b = 0, r = 0, global_r = global_rowgroup_base; b < (uint32_t)batch_list.size(); + for (auto b = 0, r = 0, global_r = global_rowgroup_base; + b < static_cast(batch_list.size()); b++) { // Count pages in this batch - uint32_t rnext = r + batch_list[b]; - uint32_t first_page_in_batch = chunks[r][0].first_page; - uint32_t first_page_in_next_batch = + auto const rnext = r + batch_list[b]; + auto const first_page_in_batch = chunks[r][0].first_page; + auto const first_page_in_next_batch = (rnext < num_rowgroups) ? 
chunks[rnext][0].first_page : num_pages; - uint32_t pages_in_batch = first_page_in_next_batch - first_page_in_batch; + auto const pages_in_batch = first_page_in_next_batch - first_page_in_batch; // device_span batch_pages{pages.data() + first_page_in_batch, } encode_pages( chunks, @@ -1514,7 +1517,7 @@ std::unique_ptr> writer::close(std::string const& column_ch return _impl->close(column_chunks_file_path); } -std::unique_ptr> writer::merge_rowgroup_metadata( +std::unique_ptr> writer::merge_row_group_metadata( const std::vector>>& metadata_list) { std::vector output; diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index c7cdf8effd1..9188218f607 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -56,13 +56,6 @@ using cudf::detail::hostdevice_2dvector; * @brief Implementation for parquet writer */ class writer::impl { - // Parquet datasets are divided into fixed-size, independent rowgroups - static constexpr uint32_t DEFAULT_ROWGROUP_MAXSIZE = 128 * 1024 * 1024; // 128MB - static constexpr uint32_t DEFAULT_ROWGROUP_MAXROWS = 1000000; // Or at most 1M rows - - // rowgroups are divided into pages - static constexpr uint32_t DEFAULT_TARGET_PAGE_SIZE = 512 * 1024; - public: /** * @brief Constructor with writer options. @@ -209,9 +202,8 @@ class writer::impl { // Cuda stream to be used rmm::cuda_stream_view stream = rmm::cuda_stream_default; - size_t max_rowgroup_size_ = DEFAULT_ROWGROUP_MAXSIZE; - size_t max_rowgroup_rows_ = DEFAULT_ROWGROUP_MAXROWS; - size_t target_page_size_ = DEFAULT_TARGET_PAGE_SIZE; + size_t max_row_group_size = default_row_group_size_bytes; + size_type max_row_group_rows = default_row_group_size_rows; Compression compression_ = Compression::UNCOMPRESSED; statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 3bae8d7ab1e..b233819092a 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -3056,4 +3056,26 @@ TEST_F(ParquetReaderTest, EmptyOutput) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +TEST_F(ParquetWriterTest, RowGroupSizeInvalid) +{ + const auto unused_table = std::make_unique
(); + std::vector out_buffer; + + EXPECT_THROW( + cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .row_group_size_rows(4999), + cudf::logic_error); + EXPECT_THROW( + cudf_io::parquet_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .row_group_size_bytes(511 << 10), + cudf::logic_error); + + EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) + .row_group_size_rows(4999), + cudf::logic_error); + EXPECT_THROW(cudf_io::chunked_parquet_writer_options::builder(cudf_io::sink_info(&out_buffer)) + .row_group_size_bytes(511 << 10), + cudf::logic_error); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index f0450483345..4b5ec913fb6 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -72,10 +72,10 @@ cdef extern from "cudf/io/orc.hpp" \ orc_writer_options() cudf_io_types.sink_info get_sink() except+ cudf_io_types.compression_type get_compression() except+ - bool enable_statistics() except+ - size_t stripe_size_bytes() except+ - size_type stripe_size_rows() except+ - size_type row_index_stride() except+ + bool is_enabled_statistics() except+ + size_t get_stripe_size_bytes() except+ + size_type get_stripe_size_rows() except+ + size_type get_row_index_stride() except+ cudf_table_view.table_view get_table() except+ const cudf_io_types.table_input_metadata *get_metadata() except+ diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 81ca7e5836b..9d95dce83bc 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -74,6 +74,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_table_view.table_view get_table() except + const cudf_io_types.table_input_metadata get_metadata() except + string get_column_chunks_file_path() except+ + size_t get_row_group_size_bytes() except+ + size_type get_row_group_size_rows() except+ void set_metadata( cudf_io_types.table_input_metadata *m @@ -87,6 +89,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_column_chunks_file_path( string column_chunks_file_path ) except + + void set_row_group_size_bytes(size_t val) except+ + void set_row_group_size_rows(size_type val) except+ @staticmethod parquet_writer_options_builder builder( @@ -116,6 +120,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& int96_timestamps( bool enabled ) except + + parquet_writer_options_builder& row_group_size_bytes( + size_t val + ) except+ + parquet_writer_options_builder& row_group_size_rows( + size_type val + ) except+ parquet_writer_options build() except + @@ -130,6 +140,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.statistics_freq get_stats_level() except + cudf_io_types.table_input_metadata* get_metadata( ) except+ + size_t get_row_group_size_bytes() except+ + size_type get_row_group_size_rows() except+ void set_metadata( cudf_io_types.table_input_metadata *m @@ -140,6 +152,8 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_compression( cudf_io_types.compression_type compression ) except + + void set_row_group_size_bytes(size_t val) except+ + void set_row_group_size_rows(size_type val) except+ @staticmethod chunked_parquet_writer_options_builder builder( @@ -160,6 +174,12 @@ cdef extern from 
"cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& compression( cudf_io_types.compression_type compression ) except + + parquet_writer_options_builder& row_group_size_bytes( + size_t val + ) except+ + parquet_writer_options_builder& row_group_size_rows( + size_type val + ) except+ chunked_parquet_writer_options build() except + @@ -173,6 +193,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: string column_chunks_file_path, ) except+ - cdef unique_ptr[vector[uint8_t]] merge_rowgroup_metadata( + cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata( const vector[unique_ptr[vector[uint8_t]]]& metadata_list ) except + diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 71705f4d0c1..d17184685fa 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -46,7 +46,7 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.io.parquet cimport ( chunked_parquet_writer_options, chunked_parquet_writer_options_builder, - merge_rowgroup_metadata as parquet_merge_metadata, + merge_row_group_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, parquet_reader_options, parquet_writer_options, @@ -282,7 +282,9 @@ cpdef write_parquet( object compression="snappy", object statistics="ROWGROUP", object metadata_file_path=None, - object int96_timestamps=False): + object int96_timestamps=False, + object row_group_size_bytes=None, + object row_group_size_rows=None): """ Cython function to call into libcudf API, see `write_parquet`. @@ -334,7 +336,6 @@ cpdef write_parquet( cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) - cdef parquet_writer_options args cdef unique_ptr[vector[uint8_t]] out_metadata_c cdef string c_column_chunks_file_path cdef bool _int96_timestamps = int96_timestamps @@ -342,16 +343,21 @@ cpdef write_parquet( c_column_chunks_file_path = str.encode(metadata_file_path) # Perform write + cdef parquet_writer_options args = move( + parquet_writer_options.builder(sink, tv) + .metadata(tbl_meta.get()) + .compression(comp_type) + .stats_level(stat_freq) + .column_chunks_file_path(c_column_chunks_file_path) + .int96_timestamps(_int96_timestamps) + .build() + ) + if row_group_size_bytes is not None: + args.set_row_group_size_bytes(row_group_size_bytes) + if row_group_size_rows is not None: + args.set_row_group_size_rows(row_group_size_rows) + with nogil: - args = move( - parquet_writer_options.builder(sink, tv) - .metadata(tbl_meta.get()) - .compression(comp_type) - .stats_level(stat_freq) - .column_chunks_file_path(c_column_chunks_file_path) - .int96_timestamps(_int96_timestamps) - .build() - ) out_metadata_c = move(parquet_writer(args)) if metadata_file_path is not None: @@ -483,11 +489,11 @@ cdef class ParquetWriter: cpdef merge_filemetadata(object filemetadata_list): """ - Cython function to call into libcudf API, see `merge_rowgroup_metadata`. + Cython function to call into libcudf API, see `merge_row_group_metadata`. 
See Also
     --------
-    cudf.io.parquet.merge_rowgroup_metadata
+    cudf.io.parquet.merge_row_group_metadata
     """
     cdef vector[unique_ptr[vector[uint8_t]]] list_c
     cdef vector[uint8_t] blob_c
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 302021a082f..9d665d9a0a5 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -441,6 +441,8 @@ def to_parquet(
     statistics="ROWGROUP",
     metadata_file_path=None,
     int96_timestamps=False,
+    row_group_size_bytes=None,
+    row_group_size_rows=None,
     *args,
     **kwargs,
 ):
@@ -480,6 +482,8 @@ def to_parquet(
             statistics=statistics,
             metadata_file_path=metadata_file_path,
             int96_timestamps=int96_timestamps,
+            row_group_size_bytes=row_group_size_bytes,
+            row_group_size_rows=row_group_size_rows,
         )
     else:
         write_parquet_res = libparquet.write_parquet(
@@ -490,6 +494,8 @@ def to_parquet(
             statistics=statistics,
             metadata_file_path=metadata_file_path,
             int96_timestamps=int96_timestamps,
+            row_group_size_bytes=row_group_size_bytes,
+            row_group_size_rows=row_group_size_rows,
         )
 
     return write_parquet_res
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index df31738050b..b6595be9566 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -231,6 +231,11 @@ def _make_parquet_path_or_buf(src):
     yield _make_parquet_path_or_buf
 
 
+@pytest.fixture(scope="module")
+def large_int64_gdf():
+    return cudf.DataFrame.from_pandas(pd.DataFrame({"col": range(0, 1 << 20)}))
+
+
 @pytest.mark.filterwarnings("ignore:Using CPU")
 @pytest.mark.parametrize("engine", ["pyarrow", "cudf"])
 @pytest.mark.parametrize(
@@ -2170,3 +2175,21 @@ def test_parquet_reader_brotli(datadir):
     got = cudf.read_parquet(fname).to_pandas(nullable=True)
 
     assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("size_bytes", [4_000_000, 1_000_000, 600_000])
+@pytest.mark.parametrize("size_rows", [1_000_000, 100_000, 10_000])
+def test_parquet_writer_row_group_size(
+    tmpdir, large_int64_gdf, size_bytes, size_rows
+):
+    fname = tmpdir.join("row_group_size.parquet")
+    large_int64_gdf.to_parquet(
+        fname, row_group_size_bytes=size_bytes, row_group_size_rows=size_rows
+    )
+
+    num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
+    # 8 bytes per row, as the column is int64
+    expected_num_row_groups = max(
+        math.ceil(num_rows / size_rows), math.ceil(8 * num_rows / size_bytes)
+    )
+    assert expected_num_row_groups == row_groups
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 6746753249c..11994830fed 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -221,6 +221,12 @@
     timestamp[us] to the int96 format, which is the number of Julian
     days and the number of nanoseconds since midnight. If ``False``,
     timestamps will not be altered.
+row_group_size_bytes: integer or None, default None
+    Maximum size of each row group of the output.
+    If None, 134217728 (128MB) will be used.
+row_group_size_rows: integer or None, default None
+    Maximum number of rows of each row group of the output.
+    If None, 1000000 will be used.
 
 
 See Also
@@ -404,10 +410,10 @@
 stripe_size_bytes: integer or None, default None
     Maximum size of each stripe of the output.
     If None, 67108864 (64MB) will be used.
-stripe_size_rows: integer or None, default None 1000000
+stripe_size_rows: integer or None, default None
     Maximum number of rows of each stripe of the output.
     If None, 1000000 will be used.
-row_index_stride: integer or None, default None 10000 +row_index_stride: integer or None, default None Row index stride (maximum number of rows in each row group). If None, 10000 will be used. From 17e6f5b9d0a9456e82250f725da5fe61ce6c9ff5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 17 Nov 2021 14:58:38 -0800 Subject: [PATCH 009/202] Simplify merge internals and reduce overhead (#9516) This PR is a pretty thorough rewrite of the internals of merging. There is a ton of complexity imposed by matching all the different edge cases allowed by the pandas API, but I've tried to unify the logic for different code paths as much as possible. I've also added checks for a number of edge cases that were not previously being handled. I see about a 10% performance improvement for merges on small to medium data sizes from this PR (as expected, there's no change for large data where most time is spent in C++). There's also a substantial reduction in total code that should make it easier to address issues going forward. I'm still not entirely happy with the complexity of the result and I think that further simplification should be possible, but I think this is a sufficiently large step forward to be worth pushing forward in this state, especially if it helps enable other changes to joining. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9516 --- python/cudf/cudf/core/dataframe.py | 30 +- python/cudf/cudf/core/frame.py | 9 +- python/cudf/cudf/core/groupby/groupby.py | 24 - python/cudf/cudf/core/join/__init__.py | 4 +- python/cudf/cudf/core/join/_join_helpers.py | 118 ++-- python/cudf/cudf/core/join/join.py | 505 ++++++++---------- python/cudf/cudf/tests/test_joining.py | 34 +- python/dask_cudf/dask_cudf/tests/test_join.py | 4 - 8 files changed, 288 insertions(+), 440 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b2e6588edb2..a95453a4e62 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -598,9 +598,12 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): else: if is_list_like(data): if len(data) > 0 and is_scalar(data[0]): - new_df = self._from_columns( - [data], index=index, columns=columns - ) + if columns is not None: + data = dict(zip(columns, [data])) + else: + data = dict(enumerate([data])) + new_df = DataFrame(data=data, index=index) + self._data = new_df._data self.index = new_df._index self.columns = new_df.columns @@ -3760,11 +3763,8 @@ def join( FutureWarning, ) - lhs = self - rhs = other - - df = lhs.merge( - rhs, + df = self.merge( + other, left_index=True, right_index=True, how=how, @@ -3772,7 +3772,7 @@ def join( sort=sort, ) df.index.name = ( - None if lhs.index.name != rhs.index.name else lhs.index.name + None if self.index.name != other.index.name else self.index.name ) return df @@ -5093,18 +5093,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._index = as_index(index) return df - @classmethod - def _from_columns(cls, cols, index=None, columns=None): - """ - Construct a DataFrame from a list of Columns - """ - if columns is not None: - data = dict(zip(columns, cols)) - else: - data = dict(enumerate(cols)) - - return cls(data=data, index=index,) - def interpolate( self, method="linear", diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index c0858398492..72239fc2a8e 100644 --- 
a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -46,7 +46,7 @@ serialize_columns, ) from cudf.core.column_accessor import ColumnAccessor -from cudf.core.join import merge +from cudf.core.join import Merge, MergeSemi from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame from cudf.core.window import Rolling from cudf.utils import ioutils @@ -3755,6 +3755,7 @@ def _merge( suffixes=("_x", "_y"), ): lhs, rhs = self, right + merge_cls = Merge if how == "right": # Merge doesn't support right, so just swap how = "left" @@ -3762,8 +3763,10 @@ def _merge( left_on, right_on = right_on, left_on left_index, right_index = right_index, left_index suffixes = (suffixes[1], suffixes[0]) + elif how in {"leftsemi", "leftanti"}: + merge_cls = MergeSemi - return merge( + return merge_cls( lhs, rhs, on=on, @@ -3775,7 +3778,7 @@ def _merge( sort=sort, indicator=indicator, suffixes=suffixes, - ) + ).perform_merge() def _is_sorted(self, ascending=None, null_position=None): """ diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ba69e42674a..dc6461663ce 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1178,18 +1178,6 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): _PROTECTED_KEYS = frozenset(("obj",)) - def __init__( - self, obj, by=None, level=None, sort=False, as_index=True, dropna=True - ): - super().__init__( - obj=obj, - by=by, - level=level, - sort=sort, - as_index=as_index, - dropna=dropna, - ) - def __getitem__(self, key): return self.obj[key].groupby( self.grouping, dropna=self._dropna, sort=self._sort @@ -1262,18 +1250,6 @@ class SeriesGroupBy(GroupBy): Name: Max Speed, dtype: float64 """ - def __init__( - self, obj, by=None, level=None, sort=False, as_index=True, dropna=True - ): - super().__init__( - obj=obj, - by=by, - level=level, - sort=sort, - as_index=as_index, - dropna=dropna, - ) - def agg(self, func): result = super().agg(func) diff --git a/python/cudf/cudf/core/join/__init__.py b/python/cudf/cudf/core/join/__init__.py index 0463b8f9df1..71a91c398ad 100644 --- a/python/cudf/cudf/core/join/__init__.py +++ b/python/cudf/cudf/core/join/__init__.py @@ -1,3 +1,3 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
-from cudf.core.join.join import merge +from cudf.core.join.join import Merge, MergeSemi diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index cc9c0fb66da..6dec0b10273 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -3,16 +3,17 @@ import collections import warnings -from typing import TYPE_CHECKING, Any, Iterable, Tuple +from typing import TYPE_CHECKING, Any, Tuple, cast import numpy as np -import pandas as pd import cudf +from cudf.api.types import is_dtype_equal +from cudf.core.column import CategoricalColumn from cudf.core.dtypes import CategoricalDtype if TYPE_CHECKING: - from cudf.core.column import CategoricalColumn, ColumnBase + from cudf.core.column import ColumnBase from cudf.core.frame import Frame @@ -28,61 +29,36 @@ class _Indexer: # >>> _Indexer("a", column=True).get(df) # returns column "a" of df # >>> _Indexer("b", index=True).get(df) # returns index level "b" of df - def __init__(self, name: Any, column=False, index=False): - if column and index: - raise ValueError("Cannot specify both column and index") + def __init__(self, name: Any): self.name = name - self.column, self.index = column, index + +class _ColumnIndexer(_Indexer): def get(self, obj: Frame) -> ColumnBase: - # get the column from `obj` - if self.column: - return obj._data[self.name] - else: - if obj._index is not None: - return obj._index._data[self.name] - raise KeyError() + return obj._data[self.name] def set(self, obj: Frame, value: ColumnBase, validate=False): - # set the colum in `obj` - if self.column: - obj._data.set_by_label(self.name, value, validate=validate) - else: - if obj._index is not None: - obj._index._data.set_by_label( - self.name, value, validate=validate - ) - else: - raise KeyError() - - -def _frame_select_by_indexers( - frame: Frame, indexers: Iterable[_Indexer] -) -> Frame: - # Select columns from the given `Frame` using `indexers`, - # and return a new `Frame`. - index_data = frame._data.__class__() - data = frame._data.__class__() - - for idx in indexers: - if idx.index: - index_data.set_by_label(idx.name, idx.get(frame), validate=False) - else: - data.set_by_label(idx.name, idx.get(frame), validate=False) + obj._data.set_by_label(self.name, value, validate=validate) - result_index = ( - cudf.core.index._index_from_data(index_data) if index_data else None - ) - result = cudf.core.frame.Frame(data=data, index=result_index) - return result + +class _IndexIndexer(_Indexer): + def get(self, obj: Frame) -> ColumnBase: + if obj._index is not None: + return obj._index._data[self.name] + raise KeyError + + def set(self, obj: Frame, value: ColumnBase, validate=False): + if obj._index is not None: + obj._index._data.set_by_label(self.name, value, validate=validate) + else: + raise KeyError def _match_join_keys( lcol: ColumnBase, rcol: ColumnBase, how: str ) -> Tuple[ColumnBase, ColumnBase]: - # returns the common dtype that lcol and rcol should be casted to, - # before they can be used as left and right join keys. - # If no casting is necessary, returns None + # Casts lcol and rcol to a common dtype for use as join keys. If no casting + # is necessary, they are returned as is. 
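+    # (For example, an int32 key joined against an int64 key will have both
+    # sides promoted to int64 before the join.)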
common_type = None @@ -91,12 +67,22 @@ def _match_join_keys( rtype = rcol.dtype # if either side is categorical, different logic - if isinstance(ltype, CategoricalDtype) or isinstance( - rtype, CategoricalDtype - ): - return _match_categorical_dtypes(lcol, rcol, how) + left_is_categorical = isinstance(ltype, CategoricalDtype) + right_is_categorical = isinstance(rtype, CategoricalDtype) + if left_is_categorical and right_is_categorical: + return _match_categorical_dtypes_both( + cast(CategoricalColumn, lcol), cast(CategoricalColumn, rcol), how + ) + elif left_is_categorical or right_is_categorical: + if left_is_categorical: + if how in {"left", "leftsemi", "leftanti"}: + return lcol, rcol.astype(ltype) + common_type = ltype.categories.dtype + else: + common_type = rtype.categories.dtype + return lcol.astype(common_type), rcol.astype(common_type) - if pd.api.types.is_dtype_equal(ltype, rtype): + if is_dtype_equal(ltype, rtype): return lcol, rcol if isinstance(ltype, cudf.Decimal64Dtype) or isinstance( @@ -131,34 +117,9 @@ def _match_join_keys( return lcol.astype(common_type), rcol.astype(common_type) -def _match_categorical_dtypes( - lcol: ColumnBase, rcol: ColumnBase, how: str -) -> Tuple[ColumnBase, ColumnBase]: - # cast the keys lcol and rcol to a common dtype - # when at least one of them is a categorical type - ltype, rtype = lcol.dtype, rcol.dtype - - if isinstance(lcol, cudf.core.column.CategoricalColumn) and isinstance( - rcol, cudf.core.column.CategoricalColumn - ): - # if both are categoricals, logic is complicated: - return _match_categorical_dtypes_both(lcol, rcol, how) - - if isinstance(ltype, CategoricalDtype): - if how in {"left", "leftsemi", "leftanti"}: - return lcol, rcol.astype(ltype) - common_type = ltype.categories.dtype - elif isinstance(rtype, CategoricalDtype): - common_type = rtype.categories.dtype - return lcol.astype(common_type), rcol.astype(common_type) - - def _match_categorical_dtypes_both( lcol: CategoricalColumn, rcol: CategoricalColumn, how: str ) -> Tuple[ColumnBase, ColumnBase]: - # The commontype depends on both `how` and the specifics of the - # categorical variables to be merged. - ltype, rtype = lcol.dtype, rcol.dtype # when both are ordered and both have the same categories, @@ -184,9 +145,6 @@ def _match_categorical_dtypes_both( "neither side is ordered" ) - # the following should now always hold - assert not ltype.ordered and not rtype.ordered - if how == "inner": # cast to category types -- we must cast them back later return _match_join_keys( diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 28b2d5d8167..dd8f462fb1d 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,16 +1,14 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
from __future__ import annotations -import functools -from collections import namedtuple -from typing import TYPE_CHECKING, Callable, Tuple +from typing import TYPE_CHECKING, Callable import cudf from cudf import _lib as libcudf from cudf.core.join._join_helpers import ( _coerce_to_tuple, - _frame_select_by_indexers, - _Indexer, + _ColumnIndexer, + _IndexIndexer, _match_join_keys, ) @@ -18,47 +16,7 @@ from cudf.core.frame import Frame -def merge( - lhs, - rhs, - *, - on, - left_on, - right_on, - left_index, - right_index, - how, - sort, - indicator, - suffixes, -): - if how in {"leftsemi", "leftanti"}: - merge_cls = MergeSemi - else: - merge_cls = Merge - mergeobj = merge_cls( - lhs, - rhs, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - how=how, - sort=sort, - indicator=indicator, - suffixes=suffixes, - ) - return mergeobj.perform_merge() - - -_JoinKeys = namedtuple("JoinKeys", ["left", "right"]) - - -class Merge(object): - # A namedtuple of indexers representing the left and right keys - _keys: _JoinKeys - +class Merge: # The joiner function must have the following signature: # # def joiner( @@ -71,7 +29,7 @@ class Merge(object): # join key. The `joiner` returns a tuple of two Columns # representing the rows to gather from the left- and right- side # tables respectively. - _joiner: Callable + _joiner: Callable = libcudf.join.join def __init__( self, @@ -133,150 +91,157 @@ def __init__( how=how, suffixes=suffixes, ) - self._joiner = functools.partial(libcudf.join.join, how=how) - - self.lhs = lhs - self.rhs = rhs - self.on = on - self.left_on = left_on - self.right_on = right_on - self.left_index = left_index - self.right_index = right_index + + self.lhs = lhs.copy(deep=False) + self.rhs = rhs.copy(deep=False) self.how = how self.sort = sort - if suffixes: - self.lsuffix, self.rsuffix = suffixes - self._compute_join_keys() - - @property - def _out_class(self): - # type of the result - out_class = cudf.DataFrame + self.lsuffix, self.rsuffix = suffixes + + # At this point validation guarantees that if on is not None we + # don't have any other args, so we can apply it directly to left_on and + # right_on. 
+ self._using_left_index = bool(left_index) + left_on = ( + lhs.index._data.names if left_index else left_on if left_on else on + ) + self._using_right_index = bool(right_index) + right_on = ( + rhs.index._data.names + if right_index + else right_on + if right_on + else on + ) - if isinstance(self.lhs, cudf.MultiIndex) or isinstance( - self.rhs, cudf.MultiIndex + if left_on or right_on: + self._left_keys = [ + _ColumnIndexer(name=on) + if not self._using_left_index and on in lhs._data + else _IndexIndexer(name=on) + for on in (_coerce_to_tuple(left_on) if left_on else []) + ] + self._right_keys = [ + _ColumnIndexer(name=on) + if not self._using_right_index and on in rhs._data + else _IndexIndexer(name=on) + for on in (_coerce_to_tuple(right_on) if right_on else []) + ] + if len(self._left_keys) != len(self._right_keys): + raise ValueError( + "Merge operands must have same number of join key columns" + ) + self._using_left_index = any( + isinstance(idx, _IndexIndexer) for idx in self._left_keys + ) + self._using_right_index = any( + isinstance(idx, _IndexIndexer) for idx in self._right_keys + ) + else: + # if `on` is not provided and we're not merging + # index with column or on both indexes, then use + # the intersection of columns in both frames + on_names = set(lhs._data) & set(rhs._data) + self._left_keys = [_ColumnIndexer(name=on) for on in on_names] + self._right_keys = [_ColumnIndexer(name=on) for on in on_names] + self._using_left_index = False + self._using_right_index = False + + if isinstance(lhs, cudf.MultiIndex) or isinstance( + rhs, cudf.MultiIndex ): - out_class = cudf.MultiIndex - elif isinstance(self.lhs, cudf.BaseIndex): - out_class = self.lhs.__class__ - return out_class + self._out_class = cudf.MultiIndex + elif isinstance(lhs, cudf.BaseIndex): + self._out_class = lhs.__class__ + else: + self._out_class = cudf.DataFrame + + self._key_columns_with_same_name = ( + set(_coerce_to_tuple(on)) + if on + else set() + if (self._using_left_index or self._using_right_index) + else set( + [ + lkey.name + for lkey, rkey in zip(self._left_keys, self._right_keys) + if lkey.name == rkey.name + ] + ) + ) def perform_merge(self) -> Frame: - lhs, rhs = self._match_key_dtypes(self.lhs, self.rhs) - - left_table = _frame_select_by_indexers(lhs, self._keys.left) - right_table = _frame_select_by_indexers(rhs, self._keys.right) + left_join_cols = {} + right_join_cols = {} + + for left_key, right_key in zip(self._left_keys, self._right_keys): + lcol = left_key.get(self.lhs) + rcol = right_key.get(self.rhs) + lcol_casted, rcol_casted = _match_join_keys(lcol, rcol, self.how) + left_join_cols[left_key.name] = lcol_casted + right_join_cols[left_key.name] = rcol_casted + + # Categorical dtypes must be cast back from the underlying codes + # type that was returned by _match_join_keys. 
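+            # (This only applies to inner joins where both keys are
+            # categorical: _match_join_keys returns them as plain
+            # non-categorical columns, so they are re-wrapped as
+            # 'category' here.)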
+ if ( + self.how == "inner" + and isinstance(lcol.dtype, cudf.CategoricalDtype) + and isinstance(rcol.dtype, cudf.CategoricalDtype) + ): + lcol_casted = lcol_casted.astype("category") + rcol_casted = rcol_casted.astype("category") + + left_key.set(self.lhs, lcol_casted, validate=False) + right_key.set(self.rhs, rcol_casted, validate=False) left_rows, right_rows = self._joiner( - left_table, right_table, how=self.how, + cudf.core.frame.Frame(left_join_cols), + cudf.core.frame.Frame(right_join_cols), + how=self.how, ) - lhs, rhs = self._restore_categorical_keys(lhs, rhs) - left_result = cudf.core.frame.Frame() - right_result = cudf.core.frame.Frame() + gather_index = self._using_left_index or self._using_right_index - gather_index = self.left_index or self.right_index - if left_rows is not None: - left_result = lhs._gather( + left_result = ( + self.lhs._gather( left_rows, nullify=True, keep_index=gather_index, check_bounds=False, ) - if right_rows is not None: - right_result = rhs._gather( + if left_rows is not None + else cudf.core.frame.Frame() + ) + right_result = ( + self.rhs._gather( right_rows, nullify=True, keep_index=gather_index, check_bounds=False, ) + if right_rows is not None + else cudf.core.frame.Frame() + ) - result = self._merge_results(left_result, right_result) + result = self._out_class._from_data( + *self._merge_results(left_result, right_result) + ) if self.sort: result = self._sort_result(result) return result - def _compute_join_keys(self): - # Computes self._keys - left_keys = [] - right_keys = [] - if ( - self.left_index - or self.right_index - or self.left_on - or self.right_on - ): - if self.left_index: - left_keys.extend( - [ - _Indexer(name=on, index=True) - for on in self.lhs.index._data.names - ] - ) - if self.left_on: - # TODO: require left_on or left_index to be specified - left_keys.extend( - [ - _Indexer(name=on, column=True) - for on in _coerce_to_tuple(self.left_on) - ] - ) - if self.right_index: - right_keys.extend( - [ - _Indexer(name=on, index=True) - for on in self.rhs.index._data.names - ] - ) - if self.right_on: - # TODO: require right_on or right_index to be specified - right_keys.extend( - [ - _Indexer(name=on, column=True) - for on in _coerce_to_tuple(self.right_on) - ] - ) - elif self.on: - on_names = _coerce_to_tuple(self.on) - for on in on_names: - # If `on` is provided, Merge on columns if present, - # otherwise default to indexes. - if on in self.lhs._data: - left_keys.append(_Indexer(name=on, column=True)) - else: - left_keys.append(_Indexer(name=on, index=True)) - if on in self.rhs._data: - right_keys.append(_Indexer(name=on, column=True)) - else: - right_keys.append(_Indexer(name=on, index=True)) - - else: - # if `on` is not provided and we're not merging - # index with column or on both indexes, then use - # the intersection of columns in both frames - on_names = set(self.lhs._data) & set(self.rhs._data) - left_keys = [_Indexer(name=on, column=True) for on in on_names] - right_keys = [_Indexer(name=on, column=True) for on in on_names] - - if len(left_keys) != len(right_keys): - raise ValueError( - "Merge operands must have same number of join key columns" - ) - - self._keys = _JoinKeys(left=left_keys, right=right_keys) - - def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: + def _merge_results(self, left_result: Frame, right_result: Frame): # Merge the Frames `left_result` and `right_result` into a single # `Frame`, suffixing column names if necessary. 
# If two key columns have the same name, a single output column appears - # in the result. For all other join types, the key column from the rhs - # is simply dropped. For outer joins, the two key columns are combined - # by filling nulls in the left key column with corresponding values - # from the right key column: + # in the result. For all non-outer join types, the key column from the + # rhs is simply dropped. For outer joins, the two key columns are + # combined by filling nulls in the left key column with corresponding + # values from the right key column: if self.how == "outer": - for lkey, rkey in zip(*self._keys): + for lkey, rkey in zip(self._left_keys, self._right_keys): if lkey.name == rkey.name: # fill nulls in lhs from values in the rhs lkey.set( @@ -285,36 +250,26 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: validate=False, ) - # Compute the result column names: - # left_names and right_names will be a mappings of input column names - # to the corresponding names in the final result. - left_names = dict(zip(left_result._data, left_result._data)) - right_names = dict(zip(right_result._data, right_result._data)) - - # For any columns from left_result and right_result that have the same - # name: - # - if they are key columns, keep only the left column - # - if they are not key columns, use suffixes to differentiate them - # in the final result - common_names = set(left_names) & set(right_names) - - if self.on: - key_columns_with_same_name = self.on - else: - key_columns_with_same_name = [ - lkey.name - for lkey, rkey in zip(*self._keys) - if ( - (lkey.index, rkey.index) == (False, False) - and lkey.name == rkey.name - ) - ] - for name in common_names: - if name not in key_columns_with_same_name: - left_names[name] = f"{name}{self.lsuffix}" - right_names[name] = f"{name}{self.rsuffix}" + # All columns from the left table make it into the output. Non-key + # columns that share a name with a column in the right table are + # suffixed with the provided suffix. + common_names = set(left_result._data.names) & set( + right_result._data.names + ) + cols_to_suffix = common_names - self._key_columns_with_same_name + data = { + (f"{name}{self.lsuffix}" if name in cols_to_suffix else name): col + for name, col in left_result._data.items() + } + + # The right table follows the same rule as the left table except that + # key columns from the right table are removed. + for name, col in right_result._data.items(): + if name in common_names: + if name not in self._key_columns_with_same_name: + data[f"{name}{self.rsuffix}"] = col else: - del right_names[name] + data[name] = col # determine if the result has multiindex columns. 
The result # of a join has a MultiIndex as its columns if: @@ -333,69 +288,44 @@ def _merge_results(self, left_result: Frame, right_result: Frame) -> Frame: else: multiindex_columns = False - # Assemble the data columns of the result - data = left_result._data.__class__(multiindex=multiindex_columns) - - for lcol in left_names: - data.set_by_label( - left_names[lcol], left_result._data[lcol], validate=False - ) - for rcol in right_names: - data.set_by_label( - right_names[rcol], right_result._data[rcol], validate=False - ) - - # Index of the result: - if self.left_index and self.right_index: + if self._using_right_index: + # right_index and left_on index = left_result._index - elif self.left_index: + elif self._using_left_index: # left_index and right_on index = right_result._index - elif self.right_index: - # right_index and left_on - index = left_result._index else: index = None # Construct result from data and index: - result = self._out_class._from_data(data=data, index=index) - - return result + return ( + left_result._data.__class__( + data=data, multiindex=multiindex_columns + ), + index, + ) def _sort_result(self, result: Frame) -> Frame: # Pandas sorts on the key columns in the # same order as given in 'on'. If the indices are used as # keys, the index will be sorted. If one index is specified, # the key columns on the other side will be used to sort. - if self.on: - if isinstance(result, cudf.BaseIndex): - sort_order = result._get_sorted_inds() - else: - # need a list instead of a tuple here because - # _get_sorted_inds calls down to ColumnAccessor.get_by_label - # which handles lists and tuples differently - sort_order = result._get_sorted_inds( - list(_coerce_to_tuple(self.on)) - ) - return result._gather( - sort_order, keep_index=False, check_bounds=False - ) by = [] - if self.left_index and self.right_index: + if self._using_left_index and self._using_right_index: if result._index is not None: by.extend(result._index._data.columns) - if self.left_on: - by.extend( - [result._data[col] for col in _coerce_to_tuple(self.left_on)] - ) - if self.right_on: - by.extend( - [result._data[col] for col in _coerce_to_tuple(self.right_on)] - ) + if not self._using_left_index: + by.extend([result._data[col.name] for col in self._left_keys]) + if not self._using_right_index: + by.extend([result._data[col.name] for col in self._right_keys]) if by: - to_sort = cudf.DataFrame._from_columns(by) + to_sort = cudf.DataFrame._from_data(dict(enumerate(by))) sort_order = to_sort.argsort() - result = result._gather(sort_order, check_bounds=False) + result = result._gather( + sort_order, + keep_index=self._using_left_index or self._using_right_index, + check_bounds=False, + ) return result @staticmethod @@ -410,10 +340,9 @@ def _validate_merge_params( how, suffixes, ): - """ - Error for various invalid combinations of merge input parameters - """ - # must actually support the requested merge type + # Error for various invalid combinations of merge input parameters + + # We must actually support the requested merge type if how not in {"left", "inner", "outer", "leftanti", "leftsemi"}: raise NotImplementedError(f"{how} merge not supported yet") @@ -424,15 +353,55 @@ def _validate_merge_params( 'Can only pass argument "on" OR "left_on" ' 'and "right_on", not a combination of both.' ) + elif left_index or right_index: + # Passing 'on' with 'left_index' or 'right_index' is ambiguous + raise ValueError( + 'Can only pass argument "on" OR "left_index" ' + 'and "right_index", not a combination of both.' 
+ ) else: # the validity of 'on' being checked by _Indexer return + elif left_on and left_index: + raise ValueError( + 'Can only pass argument "left_on" OR "left_index" not both.' + ) + elif right_on and right_index: + raise ValueError( + 'Can only pass argument "right_on" OR "right_index" not both.' + ) + + # Can't merge on a column name that is present in both a frame and its + # indexes. + if on: + for key in on: + if (key in lhs._data and key in lhs.index._data) or ( + key in rhs._data and key in rhs.index._data + ): + raise ValueError( + f"{key} is both an index level and a " + "column label, which is ambiguous." + ) + if left_on: + for key in left_on: + if key in lhs._data and key in lhs.index._data: + raise ValueError( + f"{key} is both an index level and a " + "column label, which is ambiguous." + ) + if right_on: + for key in right_on: + if key in rhs._data and key in rhs.index._data: + raise ValueError( + f"{key} is both an index level and a " + "column label, which is ambiguous." + ) # Can't merge on unnamed Series if (isinstance(lhs, cudf.Series) and not lhs.name) or ( isinstance(rhs, cudf.Series) and not rhs.name ): - raise ValueError("Can not merge on unnamed Series") + raise ValueError("Cannot merge on unnamed Series") # If nothing specified, must have common cols to use implicitly same_named_columns = set(lhs._data) & set(rhs._data) @@ -459,59 +428,15 @@ def _validate_merge_params( "lsuffix and rsuffix are not defined" ) - def _match_key_dtypes(self, lhs: Frame, rhs: Frame) -> Tuple[Frame, Frame]: - # Match the dtypes of the key columns from lhs and rhs - out_lhs = lhs.copy(deep=False) - out_rhs = rhs.copy(deep=False) - for left_key, right_key in zip(*self._keys): - lcol, rcol = left_key.get(lhs), right_key.get(rhs) - lcol_casted, rcol_casted = _match_join_keys( - lcol, rcol, how=self.how - ) - if lcol is not lcol_casted: - left_key.set(out_lhs, lcol_casted, validate=False) - if rcol is not rcol_casted: - right_key.set(out_rhs, rcol_casted, validate=False) - return out_lhs, out_rhs - - def _restore_categorical_keys( - self, lhs: Frame, rhs: Frame - ) -> Tuple[Frame, Frame]: - # For inner joins, any categorical keys in `self.lhs` and `self.rhs` - # were casted to their category type to produce `lhs` and `rhs`. - # Here, we cast them back. 
- out_lhs = lhs.copy(deep=False) - out_rhs = rhs.copy(deep=False) - if self.how == "inner": - for left_key, right_key in zip(*self._keys): - if isinstance( - left_key.get(self.lhs).dtype, cudf.CategoricalDtype - ) and isinstance( - right_key.get(self.rhs).dtype, cudf.CategoricalDtype - ): - left_key.set( - out_lhs, - left_key.get(out_lhs).astype("category"), - validate=False, - ) - right_key.set( - out_rhs, - right_key.get(out_rhs).astype("category"), - validate=False, - ) - return out_lhs, out_rhs - class MergeSemi(Merge): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._joiner = functools.partial( - libcudf.join.semi_join, how=kwargs["how"] - ) + _joiner: Callable = libcudf.join.semi_join - def _merge_results(self, lhs: Frame, rhs: Frame) -> Frame: + def _merge_results(self, lhs: Frame, rhs: Frame): # semi-join result includes only lhs columns - if issubclass(self._out_class, cudf.Index): - return self._out_class._from_data(lhs._data) - else: - return self._out_class._from_data(lhs._data, index=lhs._index) + return ( + lhs._data, + lhs._index + if not issubclass(self._out_class, cudf.Index) + else None, + ) diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index e9f55c9e51a..0518cc2c9b9 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -230,10 +230,7 @@ def test_dataframe_join_combine_cats(): expect.index = expect.index.astype("category") got = lhs.join(rhs, how="outer") - # TODO: Remove copying to host - # after https://github.com/rapidsai/cudf/issues/5676 - # is implemented - assert_eq(expect.index.sort_values(), got.index.to_pandas().sort_values()) + assert_eq(expect.index.sort_values(), got.index.sort_values()) @pytest.mark.parametrize("how", ["left", "right", "inner", "outer"]) @@ -744,12 +741,6 @@ def test_merge_sort(ons, hows): [ {"left_on": ["a"], "left_index": False, "right_index": True}, {"right_on": ["b"], "left_index": True, "right_index": False}, - { - "left_on": ["a"], - "right_on": ["b"], - "left_index": True, - "right_index": True, - }, ], ) def test_merge_sort_on_indexes(kwargs): @@ -1791,12 +1782,6 @@ def test_typecast_on_join_indexes_matching_categorical(): {"left_index": True, "right_on": "b"}, {"left_on": "a", "right_index": True}, {"left_index": True, "right_index": True}, - { - "left_on": "a", - "right_on": "b", - "left_index": True, - "right_index": True, - }, ], ) def test_series_dataframe_mixed_merging(lhs, rhs, how, kwargs): @@ -2148,3 +2133,20 @@ def test_join_on_index_with_duplicate_names(): got = lhs.join(rhs, how="inner") assert_join_results_equal(expect, got, how="inner") + + +def test_join_redundant_params(): + lhs = cudf.DataFrame( + {"a": [1, 2, 3], "c": [2, 3, 4]}, index=cudf.Index([0, 1, 2], name="c") + ) + rhs = cudf.DataFrame( + {"b": [1, 2, 3]}, index=cudf.Index([0, 1, 2], name="a") + ) + with pytest.raises(ValueError): + lhs.merge(rhs, on="a", left_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, left_on="a", left_index=True, right_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, right_on="a", left_index=True, right_index=True) + with pytest.raises(ValueError): + lhs.merge(rhs, left_on="c", right_on="b") diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py index 58811ee98fc..8b2d85c59d7 100644 --- a/python/dask_cudf/dask_cudf/tests/test_join.py +++ b/python/dask_cudf/dask_cudf/tests/test_join.py @@ -245,8 +245,6 @@ def 
test_merge_should_fail():
         left.merge(right, how="left", on=["b"])
     with pytest.raises(KeyError):
         left.merge(right, how="left", on=["c"])
-    with pytest.raises(KeyError):
-        left.merge(right, how="left", on=["a"])
 
     # Same column names
     df2["b"] = np.random.randint(0, 12, 12)
 
@@ -254,8 +252,6 @@ def test_merge_should_fail():
 
     with pytest.raises(KeyError):
         left.merge(right, how="left", on="NonCol")
-    with pytest.raises(KeyError):
-        left.merge(right, how="left", on="a")
 
 
 @pytest.mark.parametrize("how", ["inner", "left"])

From 32bacfaa0a75fd3fb5fb44b106d8138f83001184 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Kon=C3=A9?=
Date: Thu, 18 Nov 2021 00:24:07 +0100
Subject: [PATCH 010/202] Interchange dataframe protocol (#9071)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR is a basic implementation of the [interchange dataframe protocol](https://github.com/data-apis/dataframe-api/blob/main/protocol/dataframe_protocol.py) for cudf. As is well known, there are many dataframe libraries out there, and one library's weakness is often handled by another. To work across these libraries today we rely on `pandas`, with methods like `from_pandas` and `to_pandas`. This is a poor design, as it forces each library to maintain an extra dependency on pandas and its peculiarities. The interchange protocol instead provides a high-level API that dataframe libraries implement in order to communicate with one another: we get rid of the tight coupling with pandas and depend only on the protocol API, leaving each library the freedom of its own implementation details.

To illustrate:

- `df_obj = cudf_dataframe.__dataframe__()`: `df_obj` can be consumed by any library implementing the protocol.
- `df = cudf.from_dataframe(any_supported_dataframe)`: here we create a `cudf` dataframe from any dataframe object supporting the protocol.

So far, it supports the following:

- Column dtypes: `uint8`, `int`, `float`, `bool` and `categorical`.
- Missing values are handled for all these dtypes.
- `string` support is on the way.

Additionally, we support dataframes from CPU devices, like `pandas`. This is not testable here, as pandas has not yet adopted the protocol, but we have tested it locally against a monkey-patched pandas implementation of the protocol.
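As a minimal usage sketch (a hypothetical round trip between two cudf
DataFrames, exercising only the two public entry points added here; the
frame `gdf` is made up for illustration):

    import cudf

    gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [1.5, 2.5, 3.5]})

    # Producer side: expose the protocol object. A consumer library can
    # inspect columns, dtypes, null information and raw buffers through it.
    df_obj = gdf.__dataframe__()
    assert df_obj.num_columns() == 2
    assert df_obj.column_names() == ["a", "b"]

    # Consumer side: build a cudf DataFrame back from any object that
    # implements `__dataframe__`.
    gdf2 = cudf.from_dataframe(gdf)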
Authors: - Ismaël Koné (https://github.com/iskode) - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9071 --- python/cudf/cudf/__init__.py | 2 +- python/cudf/cudf/core/dataframe.py | 13 +- python/cudf/cudf/core/df_protocol.py | 829 +++++++++++++++++++++ python/cudf/cudf/tests/test_df_protocol.py | 219 ++++++ 4 files changed, 1061 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/core/df_protocol.py create mode 100644 python/cudf/cudf/tests/test_df_protocol.py diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index bc35551b5bd..f696a00d1ed 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -42,7 +42,7 @@ UInt64Index, interval_range, ) -from cudf.core.dataframe import DataFrame, from_pandas, merge +from cudf.core.dataframe import DataFrame, from_pandas, merge, from_dataframe from cudf.core.series import Series from cudf.core.multiindex import MultiIndex from cudf.core.cut import cut diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a95453a4e62..bfbe8b06c17 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -40,7 +40,7 @@ is_string_dtype, is_struct_dtype, ) -from cudf.core import column, reshape +from cudf.core import column, df_protocol, reshape from cudf.core.abc import Serializable from cudf.core.column import ( as_column, @@ -6329,6 +6329,17 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ): + return df_protocol.__dataframe__( + self, nan_as_null=nan_as_null, allow_copy=allow_copy + ) + + +def from_dataframe(df, allow_copy=False): + return df_protocol.from_dataframe(df, allow_copy=allow_copy) + def make_binop_func(op, postprocess=None): # This function is used to wrap binary operations in Frame with an diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py new file mode 100644 index 00000000000..8f258ce27b2 --- /dev/null +++ b/python/cudf/cudf/core/df_protocol.py @@ -0,0 +1,829 @@ +import collections +import enum +from typing import ( + Any, + Dict, + Iterable, + Mapping, + Optional, + Sequence, + Tuple, + cast, +) + +import cupy as cp +import numpy as np +from numba.cuda import as_cuda_array + +import cudf +from cudf.core.buffer import Buffer +from cudf.core.column import as_column, build_categorical_column, build_column + +# Implementation of interchange protocol classes +# ---------------------------------------------- + + +class _DtypeKind(enum.IntEnum): + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + + +class _Device(enum.IntEnum): + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + + +_SUPPORTED_KINDS = { + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.CATEGORICAL, + _DtypeKind.BOOL, + _DtypeKind.STRING, +} +ProtoDtype = Tuple[_DtypeKind, int, str, str] + + +class _CuDFBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__( + self, + buf: cudf.core.buffer.Buffer, + dtype: np.dtype, + allow_copy: bool = True, + ) -> None: + """ + Use cudf.core.buffer.Buffer object. 
+ """ + # Store the cudf buffer where the data resides as a private + # attribute, so we can use it to retrieve the public attributes + self._buf = buf + self._dtype = dtype + self._allow_copy = allow_copy + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._buf.nbytes + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._buf.ptr + + def __dlpack__(self): + """ + DLPack not implemented in NumPy yet, so leave it out here. + """ + try: + cudarray = as_cuda_array(self._buf).view(self._dtype) + res = cp.asarray(cudarray).toDlpack() + + except ValueError: + raise TypeError(f"dtype {self._dtype} unsupported by `dlpack`") + + return res + + def __dlpack_device__(self) -> Tuple[_Device, int]: + """ + _Device type and _Device ID for where the data in the buffer resides. + """ + return (_Device.CUDA, cp.asarray(self._buf).device.id) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(" + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "dlpack": self.__dlpack__(), + "device": self.__dlpack_device__()[0].name, + } + ) + +")" + + +class _CuDFColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + + """ + + def __init__( + self, + column: cudf.core.column.ColumnBase, + nan_as_null: bool = True, + allow_copy: bool = True, + ) -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + if not isinstance(column, cudf.core.column.ColumnBase): + raise TypeError( + "column must be a subtype of df.core.column.ColumnBase," + f"got {type(column)}" + ) + self._col = column + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + @property + def size(self) -> int: + """ + Size of the column, in elements. + """ + return self._col.size + + @property + def offset(self) -> int: + """ + Offset of first element. Always zero. + """ + return 0 + + @property + def dtype(self) -> ProtoDtype: + """ + Dtype description as a tuple + ``(kind, bit-width, format string, endianness)`` + + Kind : + + - INT = 0 + - UINT = 1 + - FLOAT = 2 + - BOOL = 20 + - STRING = 21 # UTF-8 + - DATETIME = 22 + - CATEGORICAL = 23 + + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + + - Kind specifiers are aligned with DLPack where possible + (hence the jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 + (for bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case + in the future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, + and for categoricals. 
+ - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding + of the categorical (e.g. an integer to string mapping), + this can be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, + binary, decimal, and nested (list, struct, map, union) dtypes. + """ + dtype = self._col.dtype + + # For now, assume that, if the column dtype is 'O' (i.e., `object`), + # then we have an array of strings + if not isinstance(dtype, cudf.CategoricalDtype) and dtype.kind == "O": + return (_DtypeKind.STRING, 8, "u", "=") + + return self._dtype_from_cudfdtype(dtype) + + def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: + """ + See `self.dtype` for details. + """ + # Note: 'c' (complex) not handled yet (not in array spec v1). + # 'b', 'B' (bytes), 'S', 'a', (old-style string) 'V' (void) + # not handled datetime and timedelta both map to datetime + # (is timedelta handled?) + _np_kinds = { + "i": _DtypeKind.INT, + "u": _DtypeKind.UINT, + "f": _DtypeKind.FLOAT, + "b": _DtypeKind.BOOL, + "U": _DtypeKind.STRING, + "M": _DtypeKind.DATETIME, + "m": _DtypeKind.DATETIME, + } + kind = _np_kinds.get(dtype.kind, None) + if kind is None: + # Not a NumPy/CuPy dtype. Check if it's a categorical maybe + if isinstance(dtype, cudf.CategoricalDtype): + kind = _DtypeKind.CATEGORICAL + # Codes and categories' dtypes are different. + # We use codes' dtype as these are stored in the buffer. + codes = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes + dtype = codes.dtype + else: + raise ValueError( + f"Data type {dtype} not supported by exchange protocol" + ) + + if kind not in _SUPPORTED_KINDS: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + bitwidth = dtype.itemsize * 8 + format_str = dtype.str + endianness = dtype.byteorder if kind != _DtypeKind.CATEGORICAL else "=" + return (kind, bitwidth, format_str, endianness) + + @property + def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: + """ + If the dtype is categorical, there are two options: + + - There are only values in the data buffer. + - There is a separate dictionary-style encoding for categorical values. + + Raises TypeError if the dtype is not categorical + + Content of returned dict: + + - "is_ordered" : bool, whether the ordering of dictionary + indices is semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "mapping" : dict, Python-level only (e.g. ``{int: str}``). + None if not a dictionary-style categorical. + """ + if not self.dtype[0] == _DtypeKind.CATEGORICAL: + raise TypeError( + "`describe_categorical only works on " + "a column with categorical dtype!" + ) + categ_col = cast(cudf.core.column.CategoricalColumn, self._col) + ordered = bool(categ_col.dtype.ordered) + is_dictionary = True + # NOTE: this shows the children approach is better, transforming + # `categories` to a "mapping" dict is inefficient + categories = categ_col.categories + mapping = {ix: val for ix, val in enumerate(categories.values_host)} + return ordered, is_dictionary, mapping + + @property + def describe_null(self) -> Tuple[int, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Kind: + + - 0 : non-nullable + - 1 : NaN/NaT + - 2 : sentinel value + - 3 : bit mask + - 4 : byte mask + + Value : if kind is "sentinel value", the actual value. 
+ If kind is a bit mask or a byte mask, the value (0 or 1) + indicating a missing value. + None otherwise. + """ + kind = self.dtype[0] + if self.null_count == 0: + # there is no validity mask so it is non-nullable + return 0, None + + elif kind in _SUPPORTED_KINDS: + # bit mask is universally used in cudf for missing + return 3, 0 + + else: + raise NotImplementedError( + f"Data type {self.dtype} not yet supported" + ) + + @property + def null_count(self) -> int: + """ + Number of null elements. Should always be known. + """ + return self._col.null_count + + @property + def metadata(self) -> Dict[str, Any]: + """ + Store specific metadata of the column. + """ + return {} + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + return 1 + + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["_CuDFColumn"]: + """ + Return an iterable yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + return (self,) + + def get_buffers( + self, + ) -> Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]]: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ + buffers = {} + try: + buffers["validity"] = self._get_validity_buffer() + except RuntimeError: + buffers["validity"] = None + + try: + buffers["offsets"] = self._get_offsets_buffer() + except RuntimeError: + buffers["offsets"] = None + + buffers["data"] = self._get_data_buffer() + + return buffers + + def _get_validity_buffer( + self, + ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + """ + Return the buffer containing the mask values + indicating missing data and the buffer's associated dtype. + + Raises RuntimeError if null representation is not a bit or byte mask. + """ + + null, invalid = self.describe_null + if null == 3: + if self.dtype[0] == _DtypeKind.CATEGORICAL: + valid_mask = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes._get_mask_as_column() + else: + valid_mask = self._col._get_mask_as_column() + + assert (valid_mask is not None) and ( + valid_mask.data is not None + ), "valid_mask(.data) should not be None when " + "_CuDFColumn.describe_null[0] = 3" + buffer = _CuDFBuffer( + valid_mask.data, cp.uint8, allow_copy=self._allow_copy + ) + dtype = (_DtypeKind.UINT, 8, "C", "=") + return buffer, dtype + + elif null == 1: + raise RuntimeError( + "This column uses NaN as null " + "so does not have a separate mask" + ) + elif null == 0: + raise RuntimeError( + "This column is non-nullable so does not have a mask" + ) + else: + raise NotImplementedError( + f"See {self.__class__.__name__}.describe_null method." 
+ ) + + def _get_offsets_buffer(self,) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + """ + Return the buffer containing the offset values for + variable-size binary data (e.g., variable-length strings) + and the buffer's associated dtype. + + Raises RuntimeError if the data buffer does not have an associated + offsets buffer. + """ + if self.dtype[0] == _DtypeKind.STRING: + offsets = self._col.children[0] + assert (offsets is not None) and (offsets.data is not None), " " + "offsets(.data) should not be None for string column" + + buffer = _CuDFBuffer( + offsets.data, offsets.dtype, allow_copy=self._allow_copy + ) + dtype = self._dtype_from_cudfdtype(offsets.dtype) + else: + raise RuntimeError( + "This column has a fixed-length dtype " + "so does not have an offsets buffer" + ) + + return buffer, dtype + + def _get_data_buffer(self,) -> Tuple[_CuDFBuffer, ProtoDtype]: + """ + Return the buffer containing the data and + the buffer's associated dtype. + """ + if self.dtype[0] in ( + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.BOOL, + ): + col_data = self._col + dtype = self.dtype + + elif self.dtype[0] == _DtypeKind.CATEGORICAL: + col_data = cast( + cudf.core.column.CategoricalColumn, self._col + ).codes + dtype = self._dtype_from_cudfdtype(col_data.dtype) + + elif self.dtype[0] == _DtypeKind.STRING: + col_data = self._col.children[1] + dtype = self._dtype_from_cudfdtype(col_data.dtype) + + else: + raise NotImplementedError( + f"Data type {self._col.dtype} not handled yet" + ) + assert (col_data is not None) and (col_data.data is not None), " " + f"col_data(.data) should not be None when dtype = {dtype}" + buffer = _CuDFBuffer( + col_data.data, col_data.dtype, allow_copy=self._allow_copy + ) + + return buffer, dtype + + +class _CuDFDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + Instances of this (private) class are returned from + ``cudf.DataFrame.__dataframe__`` as objects with the methods and + attributes defined on this class. + """ + + def __init__( + self, + df: "cudf.core.dataframe.DataFrame", + nan_as_null: bool = True, + allow_copy: bool = True, + ) -> None: + """ + Constructor - an instance of this (private) class is returned from + `cudf.DataFrame.__dataframe__`. + """ + self._df = df + # ``nan_as_null`` is a keyword intended for the consumer to tell the + # producer to overwrite null values in the data with + # ``NaN`` (or ``NaT``). + # This currently has no effect; once support for nullable extension + # dtypes is added, this value should be propagated to columns. + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + + @property + def metadata(self): + # `index` isn't a regular column, and the protocol doesn't support row + # labels - so we export it as cuDF-specific metadata here. 
+ return {"cudf.index": self._df.index} + + def num_columns(self) -> int: + return len(self._df.columns) + + def num_rows(self) -> int: + return len(self._df) + + def num_chunks(self) -> int: + return 1 + + def column_names(self) -> Iterable[str]: + return self._df.columns.tolist() + + def get_column(self, i: int) -> _CuDFColumn: + return _CuDFColumn( + as_column(self._df.iloc[:, i]), allow_copy=self._allow_copy + ) + + def get_column_by_name(self, name: str) -> _CuDFColumn: + return _CuDFColumn( + as_column(self._df[name]), allow_copy=self._allow_copy + ) + + def get_columns(self) -> Iterable[_CuDFColumn]: + return [ + _CuDFColumn(as_column(self._df[name]), allow_copy=self._allow_copy) + for name in self._df.columns + ] + + def select_columns(self, indices: Sequence[int]) -> "_CuDFDataFrame": + if not isinstance(indices, collections.abc.Sequence): + raise ValueError("`indices` is not a sequence") + + return _CuDFDataFrame(self._df.iloc[:, indices]) + + def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": + if not isinstance(names, collections.Sequence): + raise ValueError("`names` is not a sequence") + + return _CuDFDataFrame( + self._df.loc[:, names], self._nan_as_null, self._allow_copy + ) + + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["_CuDFDataFrame"]: + """ + Return an iterator yielding the chunks. + """ + return (self,) + + +def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True +) -> _CuDFDataFrame: + """ + The public method to attach to cudf.DataFrame. + + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN`` (or ``NaT``). + This currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this protocol + specifies contiguous buffers. + """ + return _CuDFDataFrame(self, nan_as_null=nan_as_null, allow_copy=allow_copy) + + +""" +Implementation of the dataframe exchange protocol. + +Public API +---------- + +from_dataframe : construct a cudf.DataFrame from an input data frame which + implements the exchange protocol + +Notes +----- + +- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to + do in pure Python. It's more general but definitely less friendly than + having ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack + ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack), + this is worth looking at again. + +""" + + +# A typing protocol could be added later to let Mypy validate code using +# `from_dataframe` better. 
+DataFrameObject = Any +ColumnObject = Any + + +_INTS = {8: cp.int8, 16: cp.int16, 32: cp.int32, 64: cp.int64} +_UINTS = {8: cp.uint8, 16: cp.uint16, 32: cp.uint32, 64: cp.uint64} +_FLOATS = {32: cp.float32, 64: cp.float64} +_CP_DTYPES = {0: _INTS, 1: _UINTS, 2: _FLOATS, 20: {8: bool}} + + +def from_dataframe( + df: DataFrameObject, allow_copy: bool = False +) -> _CuDFDataFrame: + """ + Construct a cudf DataFrame from ``df`` if it supports ``__dataframe__`` + """ + if isinstance(df, cudf.DataFrame): + return df + + if not hasattr(df, "__dataframe__"): + raise ValueError("`df` does not support __dataframe__") + + return _from_dataframe(df.__dataframe__(allow_copy=allow_copy)) + + +def _from_dataframe(df: DataFrameObject) -> _CuDFDataFrame: + """ + Create a cudf DataFrame object from DataFrameObject. + """ + # Check number of chunks, if there's more than one we need to iterate + if df.num_chunks() > 1: + raise NotImplementedError("More than one chunk not handled yet") + + # We need a dict of columns here, with each column being a cudf column. + columns = dict() + _buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + col = df.get_column_by_name(name) + + if col.dtype[0] in ( + _DtypeKind.INT, + _DtypeKind.UINT, + _DtypeKind.FLOAT, + _DtypeKind.BOOL, + ): + columns[name], _buf = _protocol_to_cudf_column_numeric(col) + + elif col.dtype[0] == _DtypeKind.CATEGORICAL: + columns[name], _buf = _protocol_to_cudf_column_categorical(col) + + elif col.dtype[0] == _DtypeKind.STRING: + columns[name], _buf = _protocol_to_cudf_column_string(col) + + else: + raise NotImplementedError( + f"Data type {col.dtype[0]} not handled yet" + ) + + _buffers.append(_buf) + + df_new = cudf.DataFrame._from_data(columns) + df_new._buffers = _buffers + return df_new + + +def _protocol_to_cudf_column_numeric( + col: _CuDFColumn, +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: + """ + Convert an int, uint, float or bool protocol column + to the corresponding cudf column + """ + if col.offset != 0: + raise NotImplementedError("column.offset > 0 not handled yet") + + buffers = col.get_buffers() + assert buffers["data"] is not None, "data buffer should not be None" + _dbuffer, _ddtype = buffers["data"] + _check_buffer_is_on_gpu(_dbuffer) + cudfcol_num = build_column( + Buffer(_dbuffer.ptr, _dbuffer.bufsize), + protocol_dtype_to_cupy_dtype(_ddtype), + ) + return _set_missing_values(col, cudfcol_num), buffers + + +def _check_buffer_is_on_gpu(buffer: _CuDFBuffer) -> None: + if ( + buffer.__dlpack_device__()[0] != _Device.CUDA + and not buffer._allow_copy + ): + raise TypeError( + "This operation must copy data from CPU to GPU. " + "Set `allow_copy=True` to allow it." + ) + + elif buffer.__dlpack_device__()[0] != _Device.CUDA and buffer._allow_copy: + raise NotImplementedError( + "Only cuDF/GPU dataframes are supported for now. " + "CPU (like `Pandas`) dataframes will be supported shortly." 
+ ) + + +def _set_missing_values( + protocol_col: _CuDFColumn, cudf_col: cudf.core.column.ColumnBase +) -> cudf.core.column.ColumnBase: + + valid_mask = protocol_col.get_buffers()["validity"] + if valid_mask is not None: + bitmask = cp.asarray( + Buffer(valid_mask[0].ptr, valid_mask[0].bufsize), cp.bool8 + ) + cudf_col[~bitmask] = None + + return cudf_col + + +def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype: + kind = _dtype[0] + bitwidth = _dtype[1] + if _dtype[0] not in _SUPPORTED_KINDS: + raise RuntimeError(f"Data type {_dtype[0]} not handled yet") + + return _CP_DTYPES[kind][bitwidth] + + +def _protocol_to_cudf_column_categorical( + col: _CuDFColumn, +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: + """ + Convert a categorical column to a Series instance + """ + ordered, is_dict, mapping = col.describe_categorical + if not is_dict: + raise NotImplementedError( + "Non-dictionary categoricals not supported yet" + ) + + categories = as_column(mapping.values()) + buffers = col.get_buffers() + assert buffers["data"] is not None, "data buffer should not be None" + codes_buffer, codes_dtype = buffers["data"] + _check_buffer_is_on_gpu(codes_buffer) + cdtype = protocol_dtype_to_cupy_dtype(codes_dtype) + codes = build_column( + Buffer(codes_buffer.ptr, codes_buffer.bufsize), cdtype + ) + + cudfcol = build_categorical_column( + categories=categories, + codes=codes, + mask=codes.base_mask, + size=codes.size, + ordered=ordered, + ) + + return _set_missing_values(col, cudfcol), buffers + + +def _protocol_to_cudf_column_string( + col: _CuDFColumn, +) -> Tuple[ + cudf.core.column.ColumnBase, + Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], +]: + """ + Convert a string ColumnObject to cudf Column object. 
+ """ + # Retrieve the data buffers + buffers = col.get_buffers() + + # Retrieve the data buffer containing the UTF-8 code units + assert buffers["data"] is not None, "data buffer should never be None" + data_buffer, data_dtype = buffers["data"] + _check_buffer_is_on_gpu(data_buffer) + encoded_string = build_column( + Buffer(data_buffer.ptr, data_buffer.bufsize), + protocol_dtype_to_cupy_dtype(data_dtype), + ) + + # Retrieve the offsets buffer containing the index offsets demarcating + # the beginning and end of each string + assert buffers["offsets"] is not None, "not possible for string column" + offset_buffer, offset_dtype = buffers["offsets"] + _check_buffer_is_on_gpu(offset_buffer) + offsets = build_column( + Buffer(offset_buffer.ptr, offset_buffer.bufsize), + protocol_dtype_to_cupy_dtype(offset_dtype), + ) + + cudfcol_str = build_column( + None, dtype=cp.dtype("O"), children=(offsets, encoded_string) + ) + return _set_missing_values(col, cudfcol_str), buffers diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py new file mode 100644 index 00000000000..d24c8ca2860 --- /dev/null +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -0,0 +1,219 @@ +from typing import Any, Tuple + +import cupy as cp +import pandas as pd +import pytest + +import cudf +from cudf.core.buffer import Buffer +from cudf.core.column import build_column +from cudf.core.df_protocol import ( + DataFrameObject, + _CuDFBuffer, + _CuDFColumn, + _DtypeKind, + _from_dataframe, + protocol_dtype_to_cupy_dtype, +) +from cudf.testing._utils import assert_eq + + +def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): + buf, dtype = buffer_and_dtype + device_id = cp.asarray(cudfcol.data).device.id + assert buf.__dlpack_device__() == (2, device_id) + col_from_buf = build_column( + Buffer(buf.ptr, buf.bufsize), protocol_dtype_to_cupy_dtype(dtype) + ) + # check that non null values are the equals as nulls are represented + # by sentinel values in the buffer. 
+    non_null_idxs = cudf.Series(cudfcol) != cudf.NA
+    assert_eq(col_from_buf[non_null_idxs], cudfcol[non_null_idxs])
+
+    if dtype[0] != _DtypeKind.BOOL:
+        array_from_dlpack = cp.fromDlpack(buf.__dlpack__())
+        col_array = cp.asarray(cudfcol.data_array_view)
+        assert_eq(array_from_dlpack.flatten(), col_array.flatten())
+    else:
+        pytest.raises(TypeError, buf.__dlpack__)
+
+
+def assert_column_equal(col: _CuDFColumn, cudfcol):
+    assert col.size == cudfcol.size
+    assert col.offset == 0
+    assert col.null_count == cudfcol.null_count
+    assert col.num_chunks() == 1
+    if col.null_count == 0:
+        pytest.raises(RuntimeError, col._get_validity_buffer)
+        assert col.get_buffers()["validity"] is None
+    else:
+        assert_buffer_equal(
+            col.get_buffers()["validity"],
+            cudfcol._get_mask_as_column().astype(cp.uint8),
+        )
+
+    if col.dtype[0] == _DtypeKind.CATEGORICAL:
+        assert_buffer_equal(col.get_buffers()["data"], cudfcol.codes)
+        assert col.get_buffers()["offsets"] is None
+
+    elif col.dtype[0] == _DtypeKind.STRING:
+        assert_buffer_equal(col.get_buffers()["data"], cudfcol.children[1])
+        assert_buffer_equal(col.get_buffers()["offsets"], cudfcol.children[0])
+
+    else:
+        assert_buffer_equal(col.get_buffers()["data"], cudfcol)
+        assert col.get_buffers()["offsets"] is None
+
+    if col.null_count == 0:
+        assert col.describe_null == (0, None)
+    else:
+        assert col.describe_null == (3, 0)
+
+
+def assert_dataframe_equal(dfo: DataFrameObject, df: cudf.DataFrame):
+    assert dfo.num_columns() == len(df.columns)
+    assert dfo.num_rows() == len(df)
+    assert dfo.num_chunks() == 1
+    assert dfo.column_names() == list(df.columns)
+    for col in df.columns:
+        assert_column_equal(dfo.get_column_by_name(col), df[col]._column)
+
+
+def assert_from_dataframe_equals(dfobj):
+    df2 = _from_dataframe(dfobj)
+
+    assert_dataframe_equal(dfobj, df2)
+    if isinstance(dfobj._df, cudf.DataFrame):
+        assert_eq(dfobj._df, df2)
+
+    elif isinstance(dfobj._df, pd.DataFrame):
+        assert_eq(cudf.DataFrame(dfobj._df), df2)
+
+    else:
+        raise TypeError(f"{type(dfobj._df)} not supported yet.")
+
+
+def assert_from_dataframe_exception(dfobj):
+    exception_msg = (
+        "This operation must copy data from CPU to GPU."
+        " Set `allow_copy=True` to allow it."
+    )
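+    # Note: pytest.raises(match=...) applies re.search to the exception
+    # message, so the text above is interpreted as a regular expression.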
+ with pytest.raises(TypeError, match=exception_msg): + _from_dataframe(dfobj) + + +def assert_df_unique_dtype_cols(data): + cdf = cudf.DataFrame(data=data) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + + +def test_from_dataframe(): + data = dict(a=[1, 2, 3], b=[9, 10, 11]) + df1 = cudf.DataFrame(data=data) + df2 = cudf.from_dataframe(df1) + assert_eq(df1, df2) + + +def test_int_dtype(): + data_int = dict(a=[1, 2, 3], b=[9, 10, 11]) + assert_df_unique_dtype_cols(data_int) + + +def test_float_dtype(): + data_float = dict(a=[1.5, 2.5, 3.5], b=[9.2, 10.5, 11.8]) + assert_df_unique_dtype_cols(data_float) + + +def test_categorical_dtype(): + cdf = cudf.DataFrame({"A": [1, 2, 5, 1]}) + cdf["A"] = cdf["A"].astype("category") + col = cdf.__dataframe__().get_column_by_name("A") + assert col.dtype[0] == _DtypeKind.CATEGORICAL + assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(cdf.__dataframe__(allow_copy=True)) + + +def test_bool_dtype(): + data_bool = dict(a=[True, True, False], b=[False, True, False]) + assert_df_unique_dtype_cols(data_bool) + + +def test_string_dtype(): + data_string = dict(a=["a", "b", "cdef", "", "g"]) + assert_df_unique_dtype_cols(data_string) + + +def test_mixed_dtype(): + data_mixed = dict( + int=[1, 2, 3], + float=[1.5, 2.5, 3.5], + bool=[True, False, True], + categorical=[5, 1, 5], + string=["rapidsai-cudf ", "", "df protocol"], + ) + assert_df_unique_dtype_cols(data_mixed) + + +def test_NA_int_dtype(): + data_int = dict( + a=[1, None, 3, None, 5], + b=[9, 10, None, 7, 8], + c=[6, 19, 20, 100, 1000], + ) + assert_df_unique_dtype_cols(data_int) + + +def test_NA_float_dtype(): + data_float = dict( + a=[1.4, None, 3.6, None, 5.2], + b=[9.7, 10.9, None, 7.8, 8.2], + c=[6.1, 19.2, 20.3, 100.4, 1000.5], + ) + assert_df_unique_dtype_cols(data_float) + + +def test_NA_categorical_dtype(): + df = cudf.DataFrame({"A": [1, 2, 5, 1]}) + df["B"] = df["A"].astype("category") + df.at[[1, 3], "B"] = None # Set two items to null + + # Some detailed testing for correctness of dtype and null handling: + col = df.__dataframe__().get_column_by_name("B") + assert col.dtype[0] == _DtypeKind.CATEGORICAL + assert col.null_count == 2 + assert col.describe_null == (3, 0) + assert col.num_chunks() == 1 + assert col.describe_categorical == (False, True, {0: 1, 1: 2, 2: 5}) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + + +def test_NA_bool_dtype(): + data_bool = dict(a=[None, True, False], b=[False, None, None]) + assert_df_unique_dtype_cols(data_bool) + + +def test_NA_string_dtype(): + df = cudf.DataFrame({"A": ["a", "b", "cdef", "", "g"]}) + df["B"] = df["A"].astype("object") + df.at[1, "B"] = cudf.NA # Set one item to null + + # Test for correctness and null handling: + col = df.__dataframe__().get_column_by_name("B") + assert col.dtype[0] == _DtypeKind.STRING + assert col.null_count == 1 + assert col.describe_null == (3, 0) + assert col.num_chunks() == 1 + assert_from_dataframe_equals(df.__dataframe__(allow_copy=False)) + assert_from_dataframe_equals(df.__dataframe__(allow_copy=True)) + + +def test_NA_mixed_dtype(): + data_mixed = dict( + int=[1, None, 2, 3, 1000], + float=[None, 1.5, 2.5, 3.5, None], + bool=[True, None, False, None, None], + categorical=[5, 1, 5, 3, None], + string=[None, None, None, "df 
protocol", None], + ) + assert_df_unique_dtype_cols(data_mixed) From d4ff5185d10a988e26b9a32affed0ca5af821e78 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Thu, 18 Nov 2021 00:07:28 -0600 Subject: [PATCH 011/202] Simplify write_csv by removing unnecessary writer/impl classes (#9089) Depends on #9040 and (unfortunately) #9041 Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9089 --- cpp/include/cudf/io/detail/csv.hpp | 56 ++++--------- cpp/src/io/csv/durations.hpp | 39 +++++++++ cpp/src/io/csv/writer_impl.cu | 113 +++++++++++++------------- cpp/src/io/csv/writer_impl.hpp | 122 ----------------------------- cpp/src/io/functions.cpp | 10 ++- 5 files changed, 116 insertions(+), 224 deletions(-) create mode 100644 cpp/src/io/csv/durations.hpp delete mode 100644 cpp/src/io/csv/writer_impl.hpp diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index aac44bed50e..c190340f6c1 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,45 +40,23 @@ table_with_metadata read_csv(std::unique_ptr&& source, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -class writer { - public: - class impl; - - private: - std::unique_ptr _impl; - - public: - /** - * @brief Constructor for output to a file. - * - * @param sinkp The data sink to write the data to - * @param options Settings for controlling writing behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - writer(std::unique_ptr sinkp, - csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); // cannot provide definition here (because - // _impl is incomplete hence unique_ptr has - // not enough sizeof() info) - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~writer(); +/** + * @brief Write an entire dataset to CSV format. + * + * @param sink Output sink + * @param table The set of columns + * @param metadata The metadata associated with the table + * @param options Settings for controlling behavior + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource to use for device memory allocation + */ +void write_csv(data_sink* sink, + table_view const& table, + const table_metadata* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** - * @brief Writes the entire dataset. - * - * @param table Set of columns to output - * @param metadata Table metadata and column names - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ - void write(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); -}; } // namespace csv } // namespace detail } // namespace io diff --git a/cpp/src/io/csv/durations.hpp b/cpp/src/io/csv/durations.hpp new file mode 100644 index 00000000000..d42ddf3817c --- /dev/null +++ b/cpp/src/io/csv/durations.hpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +#include + +namespace cudf { +namespace io { +namespace detail { +namespace csv { + +std::unique_ptr pandas_format_durations( + column_view const& durations, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace csv +} // namespace detail +} // namespace io +} // namespace cudf diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index e8c673751db..b9b6fc6cf94 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -19,17 +19,25 @@ * @brief cuDF-IO CSV writer class implementation */ -#include "writer_impl.hpp" +#include "durations.hpp" + +#include "csv_common.h" +#include "csv_gpu.h" #include #include #include +#include +#include #include #include #include #include #include #include +#include +#include +#include #include #include @@ -40,13 +48,19 @@ #include #include +#include #include +#include +#include namespace cudf { namespace io { namespace detail { namespace csv { +using namespace cudf::io::csv; +using namespace cudf::io; + namespace { /** @@ -260,32 +274,16 @@ struct column_to_strings_fn { }; } // unnamed namespace -// Forward to implementation -writer::writer(std::unique_ptr sink, - csv_writer_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sink), options, mr)) -{ -} - -// Destructor within this translation unit -writer::~writer() = default; - -writer::impl::impl(std::unique_ptr sink, - csv_writer_options const& options, - rmm::mr::device_memory_resource* mr) - : out_sink_(std::move(sink)), mr_(mr), options_(options) -{ -} - // write the header: column names: // -void writer::impl::write_chunked_begin(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_chunked_begin(data_sink* out_sink, + table_view const& table, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - if (options_.is_enabled_include_header()) { + if (options.is_enabled_include_header()) { // need to generate column names if metadata is not provided std::vector generated_col_names; if (metadata == nullptr) { @@ -298,8 +296,8 @@ void writer::impl::write_chunked_begin(table_view const& table, CUDF_EXPECTS(column_names.size() == static_cast(table.num_columns()), "Mismatch between number of column headers and table 
columns."); - auto const delimiter = options_.get_inter_column_delimiter(); - auto const terminator = options_.get_line_terminator(); + auto const delimiter = options.get_inter_column_delimiter(); + auto const terminator = options.get_line_terminator(); // process header names: // - if the header name includes the delimiter or terminator character, @@ -341,18 +339,21 @@ void writer::impl::write_chunked_begin(table_view const& table, } header.append(terminator); - out_sink_->host_write(header.data(), header.size()); + out_sink->host_write(header.data(), header.size()); } } -void writer::impl::write_chunked(strings_column_view const& str_column_view, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_chunked(data_sink* out_sink, + strings_column_view const& str_column_view, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // algorithm outline: // // for_each(strings_column.begin(), strings_column.end(), - // [sink = out_sink_](auto str_row) mutable { + // [sink = out_sink](auto str_row) mutable { // auto host_buffer = str_row.host_buffer(); // sink->host_write(host_buffer_.data(), host_buffer_.size()); // });//or...sink->device_write(device_buffer,...); @@ -362,7 +363,7 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); - cudf::string_scalar newline{options_.get_line_terminator()}; + cudf::string_scalar newline{options.get_line_terminator()}; auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, newline, string_scalar("", false), stream); strings_column_view strings_column{p_str_col_w_nl->view()}; @@ -370,9 +371,9 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, auto total_num_bytes = strings_column.chars_size(); char const* ptr_all_bytes = strings_column.chars_begin(); - if (out_sink_->is_device_write_preferred(total_num_bytes)) { + if (out_sink->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory - out_sink_->device_write(ptr_all_bytes, total_num_bytes, stream); + out_sink->device_write(ptr_all_bytes, total_num_bytes, stream); } else { // copy the bytes to host to write them out thrust::host_vector h_bytes(total_num_bytes); @@ -383,30 +384,33 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, stream.value())); stream.synchronize(); - out_sink_->host_write(h_bytes.data(), total_num_bytes); + out_sink->host_write(h_bytes.data(), total_num_bytes); } // Needs newline at the end, to separate from next chunk - if (out_sink_->is_device_write_preferred(newline.size())) { - out_sink_->device_write(newline.data(), newline.size(), stream); + if (out_sink->is_device_write_preferred(newline.size())) { + out_sink->device_write(newline.data(), newline.size(), stream); } else { - out_sink_->host_write(options_.get_line_terminator().data(), - options_.get_line_terminator().size()); + out_sink->host_write(options.get_line_terminator().data(), + options.get_line_terminator().size()); } } -void writer::impl::write(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) +void write_csv(data_sink* out_sink, + table_view const& table, + table_metadata const* metadata, + csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // write header: column names separated by delimiter: // (even for 
tables with no rows) // - write_chunked_begin(table, metadata, stream); + write_chunked_begin(out_sink, table, metadata, options, stream, mr); if (table.num_rows() > 0) { // no need to check same-size columns constraint; auto-enforced by table_view - auto n_rows_per_chunk = options_.get_rows_per_chunk(); + auto n_rows_per_chunk = options.get_rows_per_chunk(); // // This outputs the CSV in row chunks to save memory. // Maybe we can use the total_rows*count calculation and a memory threshold @@ -436,7 +440,7 @@ void writer::impl::write(table_view const& table, // convert each chunk to CSV: // - column_to_strings_fn converter{options_, stream, rmm::mr::get_current_device_resource()}; + column_to_strings_fn converter{options, stream, rmm::mr::get_current_device_resource()}; for (auto&& sub_view : vector_views) { // Skip if the table has no rows if (sub_view.num_rows() == 0) continue; @@ -459,32 +463,21 @@ void writer::impl::write(table_view const& table, // concatenate columns in each row into one big string column // (using null representation and delimiter): // - std::string delimiter_str{options_.get_inter_column_delimiter()}; + std::string delimiter_str{options.get_inter_column_delimiter()}; auto str_concat_col = [&] { if (str_table_view.num_columns() > 1) return cudf::strings::detail::concatenate(str_table_view, delimiter_str, - options_.get_na_rep(), + options.get_na_rep(), strings::separator_on_nulls::YES, stream); - cudf::string_scalar narep{options_.get_na_rep()}; + cudf::string_scalar narep{options.get_na_rep()}; return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); }(); - write_chunked(str_concat_col->view(), metadata, stream); + write_chunked(out_sink, str_concat_col->view(), metadata, options, stream, mr); } } - - // finalize (no-op, for now, but offers a hook for future extensions): - // - write_chunked_end(table, metadata, stream); -} - -void writer::write(table_view const& table, - const table_metadata* metadata, - rmm::cuda_stream_view stream) -{ - _impl->write(table, metadata, stream); } } // namespace csv diff --git a/cpp/src/io/csv/writer_impl.hpp b/cpp/src/io/csv/writer_impl.hpp deleted file mode 100644 index 965c036dc75..00000000000 --- a/cpp/src/io/csv/writer_impl.hpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "csv_common.h" -#include "csv_gpu.h" - -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace csv { - -using namespace cudf::io::csv; -using namespace cudf::io; - -/** - * @brief Implementation for CSV writer - */ -class writer::impl { - public: - /** - * @brief Constructor with writer options. 
- * - * @param sink Output sink - * @param options Settings for controlling behavior - * @param mr Device memory resource to use for device memory allocation - */ - impl(std::unique_ptr sink, - csv_writer_options const& options, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Write an entire dataset to CSV format. - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write the header of a CSV format. - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked_begin(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write dataset to CSV format without header. - * - * @param strings_column Subset of columns converted to string to be written. - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked(strings_column_view const& strings_column, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - /** - * @brief Write footer of CSV format (typically, empty). - * - * @param table The set of columns - * @param metadata The metadata associated with the table - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void write_chunked_end(table_view const& table, - const table_metadata* metadata = nullptr, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) - { - // purposely no-op (for now); - } - - private: - std::unique_ptr out_sink_; - rmm::mr::device_memory_resource* mr_ = nullptr; - csv_writer_options const options_; -}; - -std::unique_ptr pandas_format_durations( - column_view const& durations, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace csv -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index a8ca1d3a459..402e212f07b 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -219,10 +219,14 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc using namespace cudf::io::detail; auto sink = make_datasink(options.get_sink()); - auto writer = - std::make_unique(std::move(sink), options, rmm::cuda_stream_default, mr); - writer->write(options.get_table(), options.get_metadata()); + return csv::write_csv( // + sink.get(), + options.get_table(), + options.get_metadata(), + options, + rmm::cuda_stream_default, + mr); } namespace detail_orc = cudf::io::detail::orc; From 406429a66fad55414fce22f2723270df411e1b75 Mon Sep 17 00:00:00 2001 From: Mayank Anand <36782063+mayankanand007@users.noreply.github.com> Date: Thu, 18 Nov 2021 10:07:58 -0500 Subject: [PATCH 012/202] ceil/floor for `DatetimeIndex` (#9554) Follow-up to #9571 where we add `ceil` and `floor` support for `Series`. Here we add `ceil` and `floor` support to `DatetimeIndex` class. 
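
For illustration, usage is expected to look like the following (a sketch
lifted from the docstring examples added in this diff):

    >>> import cudf
    >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00",
    ...                              "1999-12-31 18:40:00"])
    >>> gIndex.ceil("T")
    DatetimeIndex(['2020-05-31 08:00:00', '1999-12-31 18:40:00'],
                  dtype='datetime64[ns]', freq=None)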
This PR is dependent on #9571 getting merged first, since it assumes the `libcudf` implementation for `floor` exists.

Authors:
   - Mayank Anand (https://github.com/mayankanand007)

Approvers:
   - Michael Wang (https://github.com/isVoid)
   - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/9554
---
 docs/cudf/source/api_docs/index_objects.rst |  2 +
 python/cudf/cudf/core/frame.py              | 14 +++++
 python/cudf/cudf/core/index.py              | 62 +++++++++++++++++++++
 python/cudf/cudf/tests/test_index.py        | 26 +++++++++
 4 files changed, 104 insertions(+)

diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst
index 30269bb2a72..2a4dd5ff9c8 100644
--- a/docs/cudf/source/api_docs/index_objects.rst
+++ b/docs/cudf/source/api_docs/index_objects.rst
@@ -280,6 +280,8 @@ Time-specific operations
    :toctree: api/

    DatetimeIndex.round
+   DatetimeIndex.ceil
+   DatetimeIndex.floor

 Conversion
 ~~~~~~~~~~
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 72239fc2a8e..58fe8a43d8d 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -3673,6 +3673,13 @@ def ceil(self):
         3    5.0
         dtype: float64
         """
+
+        warnings.warn(
+            "Series.ceil and DataFrame.ceil are deprecated and will be "
+            "removed in the future",
+            DeprecationWarning,
+        )
+
         return self._unaryop("ceil")

     def floor(self):
@@ -3705,6 +3712,13 @@ def floor(self):
         5    3.0
         dtype: float64
         """
+
+        warnings.warn(
+            "Series.floor and DataFrame.floor are deprecated and will be "
+            "removed in the future",
+            DeprecationWarning,
+        )
+
         return self._unaryop("floor")

     def scale(self):
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 35b80715cca..63fda21152d 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1898,6 +1898,68 @@ def _get_dt_field(self, field):
     def is_boolean(self):
         return False

+    def ceil(self, field):
+        """
+        Perform ceil operation on the data to the specified freq.
+
+        Parameters
+        ----------
+        field : str
+            One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"].
+            Must be a fixed frequency like 'S' (second) not 'ME' (month end).
+            See `frequency aliases `__
+            for more details on these aliases.
+
+        Returns
+        -------
+        DatetimeIndex
+            Index of the same type for a DatetimeIndex
+
+        Examples
+        --------
+        >>> import cudf
+        >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00",
+        ...                              "1999-12-31 18:40:00"])
+        >>> gIndex.ceil("T")
+        DatetimeIndex(['2020-05-31 08:00:00', '1999-12-31 18:40:00'],
+                      dtype='datetime64[ns]', freq=None)
+        """
+        out_column = self._values.ceil(field)
+
+        return self.__class__._from_data({self.name: out_column})
+
+    def floor(self, field):
+        """
+        Perform floor operation on the data to the specified freq.
+
+        Parameters
+        ----------
+        field : str
+            One of ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"].
+            Must be a fixed frequency like 'S' (second) not 'ME' (month end).
+            See `frequency aliases `__
+            for more details on these aliases.
+
+        Returns
+        -------
+        DatetimeIndex
+            Index of the same type for a DatetimeIndex
+
+        Examples
+        --------
+        >>> import cudf
+        >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:59:59"
+        ...
,"1999-12-31 18:44:59"]) + >>> gIndex.floor("T") + DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], + dtype='datetime64[ns]', freq=None) + """ + out_column = self._values.floor(field) + + return self.__class__._from_data({self.name: out_column}) + class TimedeltaIndex(GenericIndex): """ diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index c6cf7c4e6f5..ab211616a02 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2470,3 +2470,29 @@ def test_index_type_methods(data, func): assert_eq(False, actual) else: assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) +def test_index_datetime_ceil(resolution): + cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) + pidx = cuidx.to_pandas() + + pidx_ceil = pidx.ceil(resolution) + cuidx_ceil = cuidx.ceil(resolution) + + assert_eq(pidx_ceil, cuidx_ceil) + + +@pytest.mark.parametrize( + "resolution", ["D", "H", "T", "min", "S", "L", "ms", "U", "us", "N"] +) +def test_index_datetime_floor(resolution): + cuidx = cudf.DatetimeIndex([1000000, 2000000, 3000000, 4000000, 5000000]) + pidx = cuidx.to_pandas() + + pidx_floor = pidx.floor(resolution) + cuidx_floor = cuidx.floor(resolution) + + assert_eq(pidx_floor, cuidx_floor) From 91fd74e0e2b9ada200f3c707cc4d0ca4efee329a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 18 Nov 2021 09:42:48 -0700 Subject: [PATCH 013/202] Support `min` and `max` reduction for structs (#9697) This PR continues to address https://github.com/rapidsai/cudf/issues/8974, adding support for structs in `min` and `max` reduction. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Mark Harris (https://github.com/harrism) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/9697 --- cpp/src/groupby/sort/group_scan_util.cuh | 20 +-- .../sort/group_single_pass_reduction_util.cuh | 20 +-- .../arg_minmax_util.cuh} | 4 +- cpp/src/reductions/simple.cuh | 61 +++++++- cpp/tests/reductions/reduction_tests.cpp | 131 +++++++++++++++++- 5 files changed, 210 insertions(+), 26 deletions(-) rename cpp/src/{groupby/sort/group_util.cuh => reductions/arg_minmax_util.cuh} (98%) diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index 013ea924cce..b565e8dc6d8 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -221,16 +221,18 @@ struct group_scan_functor(0); if (values.has_nulls()) { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::MIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::MIN); do_scan(count_iter, map_begin, binop); } else { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::MIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::MIN); do_scan(count_iter, map_begin, binop); } diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 4e0820af236..decb127b264 100644 --- 
a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include @@ -271,10 +271,11 @@ struct group_reduction_functor< auto const count_iter = thrust::make_counting_iterator(0); auto const result_begin = result->mutable_view().template begin(); if (values.has_nulls()) { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::ARGMIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::ARGMIN); do_reduction(count_iter, result_begin, binop); // Generate bitmask for the output by segmented reduction of the input bitmask. @@ -288,10 +289,11 @@ struct group_reduction_functor< validity.begin(), validity.end(), thrust::identity{}, stream, mr); result->set_null_mask(std::move(null_mask), null_count); } else { - auto const binop = row_arg_minmax_fn(values.size(), - *d_flattened_values_ptr, - flattened_null_precedences.data(), - K == aggregation::ARGMIN); + auto const binop = + cudf::reduction::detail::row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::ARGMIN); do_reduction(count_iter, result_begin, binop); } diff --git a/cpp/src/groupby/sort/group_util.cuh b/cpp/src/reductions/arg_minmax_util.cuh similarity index 98% rename from cpp/src/groupby/sort/group_util.cuh rename to cpp/src/reductions/arg_minmax_util.cuh index 31ff29ed4c3..40df23bcd8e 100644 --- a/cpp/src/groupby/sort/group_util.cuh +++ b/cpp/src/reductions/arg_minmax_util.cuh @@ -19,7 +19,7 @@ #include namespace cudf { -namespace groupby { +namespace reduction { namespace detail { /** @@ -62,5 +62,5 @@ struct row_arg_minmax_fn { }; } // namespace detail -} // namespace groupby +} // namespace reduction } // namespace cudf diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh index 13dfe5cb26c..7dd54e9250a 100644 --- a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -16,9 +16,13 @@ #pragma once +#include + #include #include +#include #include +#include #include #include #include @@ -28,6 +32,9 @@ #include #include +#include + +#include namespace cudf { namespace reduction { @@ -252,8 +259,7 @@ struct same_element_type_dispatcher { template static constexpr bool is_supported() { - return !(cudf::is_dictionary() || std::is_same_v || - std::is_same_v); + return !(cudf::is_dictionary() || std::is_same_v); } template () && - not cudf::is_fixed_point()>* = nullptr> + std::enable_if_t && + (std::is_same_v || + std::is_same_v)>* = nullptr> + std::unique_ptr operator()(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + if (input.is_empty()) { return cudf::make_empty_scalar_like(input, stream, mr); } + + auto constexpr is_min_op = std::is_same_v; + + // We will do reduction to find the ARGMIN/ARGMAX index, then return the element at that index. + // When finding ARGMIN, we need to consider nulls as larger than non-null elements, and the + // opposite for ARGMAX. + auto constexpr null_precedence = is_min_op ? 
cudf::null_order::AFTER : cudf::null_order::BEFORE; + auto const flattened_input = cudf::structs::detail::flatten_nested_columns( + table_view{{input}}, {}, std::vector{null_precedence}); + auto const d_flattened_input_ptr = table_device_view::create(flattened_input, stream); + auto const flattened_null_precedences = + is_min_op ? cudf::detail::make_device_uvector_async(flattened_input.null_orders(), stream) + : rmm::device_uvector(0, stream); + + // Perform reduction to find ARGMIN/ARGMAX. + auto const do_reduction = [&](auto const& binop) { + return thrust::reduce(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + size_type{0}, + binop); + }; + + auto const minmax_idx = [&] { + if (input.has_nulls()) { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + return do_reduction(binop); + } else { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + return do_reduction(binop); + } + }(); + + return cudf::detail::get_element(input, minmax_idx, stream, mr); + } + + template () && !cudf::is_fixed_point() && + !std::is_same_v>* = nullptr> std::unique_ptr operator()(column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 376f5ce5dd2..2c9279260e7 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -2055,7 +2056,7 @@ TEST_F(ListReductionTest, NonValidListReductionNthElement) struct StructReductionTest : public cudf::test::BaseFixture { using SCW = cudf::test::structs_column_wrapper; - void reduction_test(SCW const& struct_column, + void reduction_test(cudf::column_view const& struct_column, cudf::table_view const& expected_value, bool succeeded_condition, bool is_valid, @@ -2066,7 +2067,7 @@ struct StructReductionTest : public cudf::test::BaseFixture { cudf::reduce(struct_column, agg, cudf::data_type(cudf::type_id::STRUCT)); auto struct_result = dynamic_cast(result.get()); EXPECT_EQ(is_valid, struct_result->is_valid()); - if (is_valid) { CUDF_TEST_EXPECT_TABLES_EQUAL(expected_value, struct_result->view()); } + if (is_valid) { CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_value, struct_result->view()); } }; if (succeeded_condition) { @@ -2210,4 +2211,130 @@ TEST_F(StructReductionTest, NonValidStructReductionNthElement) cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); } +TEST_F(StructReductionTest, StructReductionMinMaxNoNull) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + + auto const input = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = INTS_CW{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return STRUCTS_CW{{child1, child2}}; + }(); + + { + auto const expected_child1 = STRINGS_CW{"$1"}; + auto const expected_child2 = INTS_CW{8}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_min_aggregation()); + } + + { + auto const expected_child1 = STRINGS_CW{"₹1"}; + auto const expected_child2 = INTS_CW{3}; + this->reduction_test(input, 
+ cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_max_aggregation()); + } +} + +TEST_F(StructReductionTest, StructReductionMinMaxSlicedInput) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + constexpr int32_t dont_care{1}; + + auto const input_original = [] { + auto child1 = STRINGS_CW{"$dont_care", + "$dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "₹dont_care"}; + auto child2 = INTS_CW{dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return STRUCTS_CW{{child1, child2}}; + }(); + + auto const input = cudf::slice(input_original, {2, 12})[0]; + + { + auto const expected_child1 = STRINGS_CW{"$1"}; + auto const expected_child2 = INTS_CW{8}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_min_aggregation()); + } + + { + auto const expected_child1 = STRINGS_CW{"₹1"}; + auto const expected_child2 = INTS_CW{3}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_max_aggregation()); + } +} + +TEST_F(StructReductionTest, StructReductionMinMaxWithNulls) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + using cudf::test::iterators::nulls_at; + + auto const input = [] { + auto child1 = STRINGS_CW{{"año", + "bit", + "₹1" /*NULL*/, + "aaa" /*NULL*/, + "zit", + "bat", + "aab", + "$1" /*NULL*/, + "€1" /*NULL*/, + "wut"}, + nulls_at({2, 7})}; + auto child2 = INTS_CW{{1, 2, 3 /*NULL*/, 4 /*NULL*/, 5, 6, 7, 8 /*NULL*/, 9 /*NULL*/, 10}, + nulls_at({2, 7})}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + { + auto const expected_child1 = STRINGS_CW{"aab"}; + auto const expected_child2 = INTS_CW{7}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_min_aggregation()); + } + + { + auto const expected_child1 = STRINGS_CW{"zit"}; + auto const expected_child2 = INTS_CW{5}; + this->reduction_test(input, + cudf::table_view{{expected_child1, expected_child2}}, + true, + true, + cudf::make_max_aggregation()); + } +} + CUDF_TEST_PROGRAM_MAIN() From fc82b1d206e93a46c9ef3535711c88ec20bd4fde Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 19 Nov 2021 02:06:54 +0530 Subject: [PATCH 014/202] Spell check fixes (#9682) Regular spell check fixes in comments and docs. 
Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Charles Blackmon-Luca (https://github.com/charlesbluca) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9682 --- cpp/src/binaryop/compiled/binary_ops.cuh | 4 ++-- cpp/src/groupby/sort/aggregate.cpp | 4 ++-- cpp/src/io/orc/aggregate_orc_metadata.cpp | 2 +- cpp/src/io/orc/aggregate_orc_metadata.hpp | 2 +- cpp/src/io/orc/stripe_enc.cu | 4 ++-- cpp/src/io/orc/writer_impl.cu | 2 +- cpp/src/io/parquet/parquet_gpu.hpp | 4 ++-- cpp/src/io/text/multibyte_split.cu | 2 +- cpp/src/lists/drop_list_duplicates.cu | 4 ++-- cpp/src/rolling/rolling_detail.cuh | 2 +- cpp/tests/column/column_view_shallow_test.cpp | 2 +- cpp/tests/datetime/datetime_ops_test.cpp | 4 ++-- cpp/tests/transform/row_bit_count_test.cu | 6 +++--- python/cudf/cudf/core/column/column.py | 4 ++-- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/decimal.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/series.py | 8 ++++---- python/cudf/cudf/core/udf/pipeline.py | 2 +- python/cudf/cudf/core/udf/typing.py | 4 ++-- python/cudf/cudf/testing/testing.py | 2 +- python/cudf/cudf/tests/test_binops.py | 2 +- python/cudf/cudf/tests/test_custom_accessor.py | 2 +- python/cudf/cudf/tests/test_datetime.py | 2 +- python/cudf/cudf/tests/test_multiindex.py | 10 +++++----- python/cudf/cudf/tests/test_orc.py | 4 ++-- python/cudf/cudf/utils/gpu_utils.py | 2 +- python/cudf/cudf/utils/ioutils.py | 4 ++-- python/cudf/cudf/utils/utils.py | 4 ++-- python/dask_cudf/dask_cudf/_version.py | 2 +- python/dask_cudf/dask_cudf/backends.py | 2 +- python/dask_cudf/dask_cudf/io/parquet.py | 4 ++-- python/dask_cudf/dask_cudf/io/tests/test_parquet.py | 2 +- 36 files changed, 57 insertions(+), 57 deletions(-) diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 84147fc9220..10e9b2532af 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -117,7 +117,7 @@ struct ops_wrapper { } else { return BinaryOperator{}.template operator()(x, y); } - // To supress nvcc warning + // To suppress nvcc warning return std::invoke_result_t{}; }(); if constexpr (is_bool_result()) @@ -164,7 +164,7 @@ struct ops2_wrapper { } else { return BinaryOperator{}.template operator()(x, y); } - // To supress nvcc warning + // To suppress nvcc warning return std::invoke_result_t{}; }(); if constexpr (is_bool_result()) diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 234bb447761..d68b701d75f 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -559,7 +559,7 @@ auto column_view_with_common_nulls(column_view const& column_0, column_view cons } /** - * @brief Perform covariance betweeen two child columns of non-nullable struct column. + * @brief Perform covariance between two child columns of non-nullable struct column. * */ template <> @@ -602,7 +602,7 @@ void aggregate_result_functor::operator()(aggregation c }; /** - * @brief Perform correlation betweeen two child columns of non-nullable struct column. + * @brief Perform correlation between two child columns of non-nullable struct column. 
+ * @brief Perform correlation between two child columns of non-nullable struct column.
* */ template <> diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 45d60605936..82161233a92 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -79,7 +79,7 @@ void add_nested_columns(std::map>& selected_co * @brief Adds the column with the given id to the mapping * * All nested columns and direct ancestors of column `id` are included. - * Columns that are not on the direct path are excluded, which may result in prunning. + * Columns that are not on the direct path are excluded, which may result in pruning. */ void add_column_to_mapping(std::map>& selected_columns, metadata const& metadata, diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp index 5132906a5fc..01418fd3bd6 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.hpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -119,7 +119,7 @@ class aggregate_orc_metadata { * @brief Filters ORC file to a selection of columns, based on their paths in the file. * * Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is - * ommited to match the cuDF table hierarchy. + * omitted to match the cuDF table hierarchy. * * @param column_paths List of full column names (i.e. paths) to select from the ORC file * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 217aee8756e..829e4877c44 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -660,7 +660,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s, auto const mask_byte = get_mask_byte(column.null_mask(), column.offset()); auto dst_offset = offset + s->nnz; auto vbuf_bit_idx = [](int row) { - // valid_buf is a circular buffer with validitiy of 8 rows in each element + // valid_buf is a circular buffer with validity of 8 rows in each element return row % (encode_block_size * 8); }; if (dst_offset % 8 == 0 and pd_set_cnt == 8) { @@ -696,7 +696,7 @@ static __device__ void encode_null_mask(orcenc_state_s* s, ByteRLE(s, s->valid_buf, s->present_out / 8, nbytes_out, flush, t) * 8; if (!t) { - // Number of rows enocoded so far + // Number of rows encoded so far s->present_out += nrows_encoded; s->numvals -= min(s->numvals, nrows_encoded); } diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 1563e3e1fd7..25c4bd65c8f 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1426,7 +1426,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, } } if (col.orc_kind() == LIST or col.orc_kind() == MAP) { - // Need a new pushdown mask unless both the parent and current colmn are not nullable + // Need a new pushdown mask unless both the parent and current column are not nullable auto const child_col = orc_table.column(col.child_begin()[0]); // pushdown mask applies to child column(s); use the child column size pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index ac2e6ba5cfb..1bd4cb3c6f4 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -307,7 +307,7 @@ struct EncColumnChunk { statistics_chunk const* stats; //!< Fragment statistics uint32_t bfr_size; //!< Uncompressed buffer size uint32_t compressed_size; //!< Compressed buffer size - uint32_t 
max_page_data_size; //!< Max data size (excuding header) of any page in this chunk + uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk uint32_t page_headers_size; //!< Sum of size of all page headers uint32_t start_row; //!< First row of chunk uint32_t num_rows; //!< Number of rows in chunk @@ -489,7 +489,7 @@ void InitFragmentStatistics(cudf::detail::device_2dspan groups /** * @brief Initialize per-chunk hash maps used for dictionary with sentinel values * - * @param chunks Flat span of chunks to intialize hash maps for + * @param chunks Flat span of chunks to initialize hash maps for * @param stream CUDA stream to use */ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index a427809c81a..d287b9f2419 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -260,7 +260,7 @@ cudf::size_type multibyte_split_scan_full_source(cudf::io::text::data_chunk_sour // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block - // would have to follow seperate logic. + // would have to follow separate logic. multibyte_split_seed_kernel<<<1, 1, 0, stream.value()>>>( // tile_multistates, tile_offsets, diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 0663bc18ab3..527e834c76c 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -67,7 +67,7 @@ struct has_negative_nans_fn { * @brief A structure to be used along with type_dispatcher to check if a column has any * negative NaN value. * - * This functor is neccessary because when calling to segmented sort on the list entries, the + * This functor is necessary because when calling to segmented sort on the list entries, the * negative NaN and positive NaN values (if both exist) are separated to the two ends of the output * lists. We want to move all NaN values close together in order to call unique_copy later on. */ @@ -563,7 +563,7 @@ std::pair, std::unique_ptr> drop_list_duplicates values ? cudf::empty_like(values.value().parent()) : nullptr}; } - // The child column conotaining list entries. + // The child column containing list entries. auto const keys_child = keys.get_sliced_child(stream); // Generate a mapping from list entries to their 1-based list indices for the keys column. diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index 12227404d83..bc1947dfeed 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -722,7 +722,7 @@ class rolling_aggregation_preprocessor final : public cudf::detail::simple_aggre } // STD aggregations depends on VARIANCE aggregation. Each element is applied - // with sqaured-root in the finalize() step. + // with square-root in the finalize() step. std::vector> visit(data_type, cudf::detail::std_aggregation const& agg) override { diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index ab324ea8505..4afa96f08d7 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -84,7 +84,7 @@ TYPED_TEST_SUITE(ColumnViewShallowTests, AllTypes); // Test for fixed_width, dict, string, list, struct // column_view, column_view = same hash. 
// column_view, make a copy = same hash. -// new column_view from colmn = same hash +// new column_view from column = same hash // column_view, copy column = diff hash // column_view, diff column = diff hash. // diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index b70ac29fd5d..2097e09e674 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -758,7 +758,7 @@ TEST_F(BasicDatetimeOpsTest, TestIsLeapYear) 707904541L, // 1992-06-07 08:09:01 GMT - leap year -2181005247L, // 1900-11-20 09:12:33 GMT - non leap year 0L, // UNIX EPOCH 1970-01-01 00:00:00 GMT - non leap year - -12212553600L, // First full year of Gregorian Calandar 1583-01-01 00:00:00 - non-leap-year + -12212553600L, // First full year of Gregorian Calendar 1583-01-01 00:00:00 - non-leap-year 0L, // null 13591632822L, // 2400-09-13 13:33:42 GMT - leap year 4539564243L, // 2113-11-08 06:04:03 GMT - non leap year @@ -827,7 +827,7 @@ TEST_F(BasicDatetimeOpsTest, TestQuarter) 707904541L, // 1992-06-07 08:09:01 GMT -2181005247L, // 1900-11-20 09:12:33 GMT 0L, // UNIX EPOCH 1970-01-01 00:00:00 GMT - -12212553600L, // First full year of Gregorian Calandar 1583-01-01 00:00:00 + -12212553600L, // First full year of Gregorian Calendar 1583-01-01 00:00:00 0L, // null 13591632822L, // 2400-09-13 13:33:42 GMT 4539564243L, // 2113-11-08 06:04:03 GMT diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 4645ff9be5f..7fb7326f221 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -228,7 +228,7 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) // Tests that `row_bit_count()` can handle struct> with more // than max_block_size (256) rows. // With a large number of rows, computation spills to multiple thread-blocks, - // thus exercising the branch-stack comptutation. + // thus exercising the branch-stack computation. // The contents of the input column aren't as pertinent to this test as the // column size. For what it's worth, it looks as follows: // [ struct({0,1}), struct({2,3}), struct({4,5}), ... ] @@ -362,7 +362,7 @@ std::pair, std::unique_ptr> build_nested_and_exp // Inner list column // clang-format off cudf::test::lists_column_wrapper list{ - {1, 2, 3, 4, 5}, + {1, 2, 3, 4, 5}, {6, 7, 8}, {33, 34, 35, 36, 37, 38, 39}, {-1, -2}, @@ -408,7 +408,7 @@ std::unique_ptr build_nested_column(std::vector const& struct_vali // Inner list column // clang-format off - cudf::test::lists_column_wrapper list{ + cudf::test::lists_column_wrapper list{ {{1, 2, 3, 4, 5}, {2, 3}}, {{6, 7, 8}, {8, 9}}, {{1, 2}, {3, 4, 5}, {33, 34, 35, 36, 37, 38, 39}}}; diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6f2f01c746d..e2bedd9d0b1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -337,7 +337,7 @@ def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray": else: return self.dropna(drop_nan=False).data_array_view - # TODO: This method is decpreated and can be removed when the associated + # TODO: This method is deprecated and can be removed when the associated # Frame methods are removed. def to_array(self, fillna=None) -> np.ndarray: """Get a dense numpy array for the data. 
@@ -1851,7 +1851,7 @@ def as_column( arbitrary = np.asarray(arbitrary) - # Handle case that `arbitary` elements are cupy arrays + # Handle case that `arbitrary` elements are cupy arrays if ( shape and shape[0] diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 756e48edccb..7c8837ef45f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -543,7 +543,7 @@ def infer_format(element: str, **kwargs) -> str: if len(second_parts) > 1: # "Z" indicates Zulu time(widely used in aviation) - Which is # UTC timezone that currently cudf only supports. Having any other - # unsuppported timezone will let the code fail below + # unsupported timezone will let the code fail below # with a ValueError. second_parts.remove("Z") second_part = "".join(second_parts[1:]) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 6409a9f9196..7037b8e6f36 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -160,7 +160,7 @@ def binary_operator(self, op, other, reflect=False): if reflect: self, other = other, self - # Binary Arithmatics between decimal columns. `Scale` and `precision` + # Binary Arithmetics between decimal columns. `Scale` and `precision` # are computed outside of libcudf if op in ("add", "sub", "mul", "div"): scale = _binop_scale(self.dtype, other.dtype, op) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bfbe8b06c17..c0cb6f1917f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6367,7 +6367,7 @@ def wrapper(self, other, axis="columns", level=None, fill_value=None): # __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature # string of a function by recursively delving into __wrapped__ until # it hits the first function that has __signature__ attribute set. To make - # the signature stirng of `wrapper` matches with its actual parameter list, + # the signature string of `wrapper` matches with its actual parameter list, # we directly set the __signature__ attribute of `wrapper` below. new_sig = inspect.signature( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index dc6461663ce..7f9f61ed3fd 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -18,7 +18,7 @@ # The three functions below return the quantiles [25%, 50%, 75%] -# respectively, which are called in the describe() method to ouput +# respectively, which are called in the describe() method to output # the summary stats of a GroupBy object def _quantile_25(x): return x.quantile(0.25) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 63fda21152d..5ea9ac945dc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -623,7 +623,7 @@ def _union(self, other, sort=None): else: return result - # If all the above optimizations don't cater to the inpputs, + # If all the above optimizations don't cater to the inputs, # we materialize RangeIndex's into `Int64Index` and # then perform `union`. 
return Int64Index(self._values)._union(other, sort=sort) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 8c4f87d5f67..a1eda697683 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -970,7 +970,7 @@ def _concat(cls, objs): source_data = [o.to_frame(index=False) for o in objs] - # TODO: Verify if this is really necesary or if we can rely on + # TODO: Verify if this is really necessary or if we can rely on # DataFrame._concat. if len(source_data) > 1: colnames = source_data[0].columns diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c804f2bca2c..cf035ef457d 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2916,7 +2916,7 @@ def unique(self): def nunique(self, method="sort", dropna=True): """Returns the number of unique values of the Series: approximate version, - and exact version to be moved to libgdf + and exact version to be moved to libcudf Excludes NA values by default. @@ -2985,7 +2985,7 @@ def value_counts( Returns ------- - result : Series contanining counts of unique values. + result : Series containing counts of unique values. See also -------- @@ -3802,7 +3802,7 @@ def wrapper(self, other, level=None, fill_value=None, axis=0): # __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature # string of a function by recursively delving into __wrapped__ until # it hits the first function that has __signature__ attribute set. To make - # the signature stirng of `wrapper` matches with its actual parameter list, + # the signature string of `wrapper` matches with its actual parameter list, # we directly set the __signature__ attribute of `wrapper` below. new_sig = inspect.signature( @@ -5054,7 +5054,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): """Returns a boolean array where two arrays are equal within a tolerance. - Two values in ``a`` and ``b`` are considiered equal when the following + Two values in ``a`` and ``b`` are considered equal when the following equation is satisfied. .. math:: diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py index deb4546e8b8..2464906be04 100644 --- a/python/cudf/cudf/core/udf/pipeline.py +++ b/python/cudf/cudf/core/udf/pipeline.py @@ -316,7 +316,7 @@ def compile_or_get(frame, func, args): Return a compiled kernel in terms of MaskedTypes that launches a kernel equivalent of `f` for the dtypes of `df`. The kernel uses a thread for each row and calls `f` using that rows data / mask - to produce an output value and output valdity for each row. + to produce an output value and output validity for each row. If the UDF has already been compiled for this requested dtypes, a cached version will be returned instead of running compilation. diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py index 4b0f0bf1283..da7ff4c0e32 100644 --- a/python/cudf/cudf/core/udf/typing.py +++ b/python/cudf/cudf/core/udf/typing.py @@ -67,7 +67,7 @@ def unify(self, context, other): """ Often within a UDF an instance arises where a variable could be a `MaskedType`, an `NAType`, or a literal based off - the data at runtime, for examplem the variable `ret` here: + the data at runtime, for example the variable `ret` here: def f(x): if x == 1: @@ -185,7 +185,7 @@ class NAType(types.Type): """ A type for handling ops against nulls Exists so we can: - 1. 
Teach numba that all occurances of `cudf.NA` are + 1. Teach numba that all occurrences of `cudf.NA` are to be read as instances of this type instead 2. Define ops like `if x is cudf.NA` where `x` is of type `Masked` to mean `if x.valid is False` diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 9562fca7399..59c291eea0b 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -410,7 +410,7 @@ def assert_series_equal( Whether to check the Index class, dtype and inferred_type are identical. check_series_type : bool, default True - Whether to check the seires class, dtype and + Whether to check the series class, dtype and inferred_type are identical. Currently it is idle, and similar to pandas. check_less_precise : bool or int, default False diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 542dcd9301c..ba2a6dce369 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1173,7 +1173,7 @@ def make_scalar_product_data(): ) ) - # we can muliply any timedelta by any int, or bool + # we can multiply any timedelta by any int, or bool valid |= set(product(TIMEDELTA_TYPES, INTEGER_TYPES | BOOL_TYPES)) # we can multiply a float by any int, float, or bool diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py index 16e5b345ce2..bfd2ccbccef 100644 --- a/python/cudf/cudf/tests/test_custom_accessor.py +++ b/python/cudf/cudf/tests/test_custom_accessor.py @@ -44,7 +44,7 @@ def test_dataframe_accessor(gdf): "gdf2", [gd.datasets.randomdata(nrows=1, dtypes={"x": int, "y": int})] ) def test_dataframe_accessor_idendity(gdf1, gdf2): - """Test for accessor idendities + """Test for accessor identities - An object should hold persistent reference to the same accessor - Different objects should hold difference instances of the accessor """ diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index bf75badc06f..a95be4f7932 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -171,7 +171,7 @@ def test_dt_ops(data): assert_eq(pd_data > pd_data, gdf_data > gdf_data) -# libgdf doesn't respect timezones +# libcudf doesn't respect timezones @pytest.mark.parametrize("data", [data1()]) @pytest.mark.parametrize("field", fields) def test_dt_series(data, field): diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index d409a099806..07407b8d359 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -738,9 +738,9 @@ def test_multiindex_copy_sem(data, levels, codes, names): ) @pytest.mark.parametrize("deep", [True, False]) def test_multiindex_copy_deep(data, deep): - """Test memory idendity for deep copy + """Test memory identity for deep copy Case1: Constructed from GroupBy, StringColumns - Case2: Constrcuted from MultiIndex, NumericColumns + Case2: Constructed from MultiIndex, NumericColumns """ same_ref = not deep @@ -768,19 +768,19 @@ def test_multiindex_copy_deep(data, deep): mi1 = data mi2 = mi1.copy(deep=deep) - # Assert ._levels idendity + # Assert ._levels identity lptrs = [lv._data._data[None].base_data.ptr for lv in mi1._levels] rptrs = [lv._data._data[None].base_data.ptr for lv in mi2._levels] assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)]) - # Assert ._codes idendity + # Assert ._codes identity 
lptrs = [c.base_data.ptr for _, c in mi1._codes._data.items()] rptrs = [c.base_data.ptr for _, c in mi2._codes._data.items()] assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)]) - # Assert ._data idendity + # Assert ._data identity lptrs = [d.base_data.ptr for _, d in mi1._data.items()] rptrs = [d.base_data.ptr for _, d in mi2._data.items()] diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 99b5652110b..6b02874146e 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -502,7 +502,7 @@ def test_orc_writer_sliced(tmpdir): "TestOrcFile.decimal.orc", "TestOrcFile.decimal.same.values.orc", "TestOrcFile.decimal.multiple.values.orc", - # For addional information take look at PR 7034 + # For additional information take look at PR 7034 "TestOrcFile.decimal.runpos.issue.orc", ], ) @@ -541,7 +541,7 @@ def test_orc_decimal_precision_fail(datadir): assert_eq(pdf, gdf) -# For addional information take look at PR 6636 and 6702 +# For additional information take look at PR 6636 and 6702 @pytest.mark.parametrize( "orc_file", [ diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py index 77963f8bcc1..dbdd68f2df8 100644 --- a/python/cudf/cudf/utils/gpu_utils.py +++ b/python/cudf/cudf/utils/gpu_utils.py @@ -143,7 +143,7 @@ def _try_get_old_or_new_symbols(): cuda_driver_supported_rt_version >= 11000 and cuda_runtime_version >= 11000 ): - # With cuda enhanced compatibitlity any code compiled + # With cuda enhanced compatibility any code compiled # with 11.x version of cuda can now run on any # driver >= 450.80.02. 11000 is the minimum cuda # version 450.80.02 supports. diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 11994830fed..0f9d9d53b23 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1038,7 +1038,7 @@ should consume messages from. Valid values are 0 - (N-1) start_offset : int, Kafka Topic/Partition offset that consumption should begin at. Inclusive. -end_offset : int, Kafka Topic/Parition offset that consumption +end_offset : int, Kafka Topic/Partition offset that consumption should end at. Inclusive. batch_timeout : int, default 10000 Maximum number of milliseconds that will be spent trying to @@ -1061,7 +1061,7 @@ or any object with a `read()` method (such as builtin `open()` file handler function or `StringIO`). delimiter : string, default None, The delimiter that should be used - for splitting text chunks into seperate cudf column rows. Currently + for splitting text chunks into separate cudf column rows. Currently only a single delimiter is supported. 
Returns diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 4f9b23bf6fe..a9611a91554 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -353,7 +353,7 @@ def get_appropriate_dispatched_func( elif hasattr(cupy_submodule, fname): cupy_func = getattr(cupy_submodule, fname) - # Handle case if cupy impliments it as a numpy function + # Handle case if cupy implements it as a numpy function # Unsure if needed if cupy_func is func: return NotImplemented @@ -374,7 +374,7 @@ def _cast_to_appropriate_cudf_type(val, index=None): elif (val.ndim == 1) or (val.ndim == 2 and val.shape[1] == 1): # if index is not None and is of a different length # than the index, cupy dispatching behaviour is undefined - # so we dont impliment it + # so we don't implement it if (index is None) or (len(index) == len(val)): return cudf.Series(val, index=index) diff --git a/python/dask_cudf/dask_cudf/_version.py b/python/dask_cudf/dask_cudf/_version.py index eb7457f3465..8ca2cf98381 100644 --- a/python/dask_cudf/dask_cudf/_version.py +++ b/python/dask_cudf/dask_cudf/_version.py @@ -417,7 +417,7 @@ def render_pep440_old(pieces): The ".dev0" means dirty. - Eexceptions: + Exceptions: 1: no tags. 0.postDISTANCE[.dev0] """ if pieces["closest-tag"]: diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index f81a4743a4a..89b5301ee83 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -196,7 +196,7 @@ def make_meta_object_cudf(x, index=None): ) elif not hasattr(x, "dtype") and x is not None: # could be a string, a dtype object, or a python type. Skip `None`, - # because it is implictly converted to `dtype('f8')`, which we don't + # because it is implicitly converted to `dtype('f8')`, which we don't # want here. 
try: dtype = np.dtype(x) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 2e5d55e92d2..b47a5e78095 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -111,7 +111,7 @@ def _read_paths( frag = next(ds.get_fragments()) if frag: # Extract hive-partition keys, and make sure they - # are orderd the same as they are in `partitions` + # are ordered the same as they are in `partitions` raw_keys = pa_ds._get_partition_keys(frag.partition_expression) partition_keys = [ (hive_part.name, raw_keys[hive_part.name]) @@ -173,7 +173,7 @@ def read_partition( strings_to_cats = kwargs.get("strings_to_categorical", False) - # Assume multi-peice read + # Assume multi-piece read paths = [] rgs = [] last_partition_keys = None diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index d93037b3802..706b0e272ea 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -378,7 +378,7 @@ def test_chunksize(tmpdir, chunksize, metadata): # one output partition assert ddf3.npartitions == 1 else: - # Files can be aggregateed together, but + # Files can be aggregated together, but # chunksize is not large enough to produce # a single output partition assert ddf3.npartitions < num_row_groups From c1bfb26715e0234f6d90aceac7a52caded2e9f9e Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 18 Nov 2021 19:29:14 -0500 Subject: [PATCH 015/202] Fix regex non-multiline EOL/$ matching strings ending with a new-line (#9715) Closes #9620 Fixes an edge case described in https://docs.python.org/3/library/re.html#re.MULTILINE where the '$' EOL regex pattern character (without `MULTILINE` set) should match at the very end of a string, and also just before a new-line at the end of that string.
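For reference, the target semantics can be checked with CPython's own `re` module; this snippet is illustrative only and is not part of the patch:

```
import re

# Without re.MULTILINE, "$" matches at the very end of the string,
# and also just before a new-line that terminates the string.
assert re.search("^abc$", "abc") is not None
assert re.search("^abc$", "abc\n") is not None  # the edge case fixed here
assert re.search("^abc$", "abc\nx") is None     # "\n" is not at the end
```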
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Christopher Harris (https://github.com/cwharris) - Vukasin Milovanovic (https://github.com/vuule) - Sheilah Kirui (https://github.com/skirui-source) URL: https://github.com/rapidsai/cudf/pull/9715 --- cpp/src/strings/regex/regex.inl | 5 ++++- cpp/tests/strings/contains_tests.cpp | 17 +++++++++-------- python/cudf/cudf/tests/test_string.py | 5 +++-- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 66e99756615..bc0679993d0 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -276,7 +276,10 @@ __device__ inline int32_t reprog_device::regexec( } break; case EOL: - if (last_character || (inst->u1.c == '$' && c == '\n')) { + if (last_character || + (c == '\n' && (inst->u1.c == '$' || + // edge case where \n appears at the end of the string + pos + 1 == dstr.length()))) { id_activate = inst->u2.next_id; expanded = true; } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 3c11444e4b5..229f9e4cc82 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -302,28 +302,29 @@ TEST_F(StringsContainsTests, CountTest) TEST_F(StringsContainsTests, MultiLine) { - auto input = cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", ""}); - auto view = cudf::strings_column_view(input); + auto input = + cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"}); + auto view = cudf::strings_column_view(input); auto results = cudf::strings::contains_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0}); + auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); results = cudf::strings::contains_re(view, "^abc$"); - expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); results = cudf::strings::matches_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0}); + auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); results = cudf::strings::matches_re(view, "^abc$"); - expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); results = cudf::strings::count_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); - auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); results = cudf::strings::count_re(view, "^abc$"); - expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); } diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index c75eb91a335..cf52c4684c8 100644 --- 
a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1746,12 +1746,13 @@ def test_string_wrap(data, width): ["A B", "1.5", "3,000"], ["23", "³", "⅕", ""], [" ", "\t\r\n ", ""], - ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], + ["$", "B", "Aab$", "$$ca", "C$B$", "cat", "cat\n"], ["line\nto be wrapped", "another\nline\nto be wrapped"], ], ) @pytest.mark.parametrize( - "pat", ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be"] + "pat", + ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be", "cat$"], ) @pytest.mark.parametrize("flags", [0, re.MULTILINE, re.DOTALL]) def test_string_count(data, pat, flags): From 05dd5415b1391270ea74d1f33080bbbf58f848cc Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 19 Nov 2021 14:32:37 -0800 Subject: [PATCH 016/202] Use List of Columns as Input for `drop_nulls`, `gather` and `drop_duplicates` (#9558) Currently, there are several APIs that accept a `Frame` object as input, corresponding to their libcudf counterparts that accept a `table_view`. To make some of them also work for columns, we currently pass the columns through `as_frame` and convert the result back with `_as_column`. This PR changes the Cython API to accept a list of columns instead, greatly reducing the overhead of the column roundtrip (see the column API benchmarks below). As a pilot study toward standardizing the Cython calling convention for table APIs, this PR makes the following decisions: 1. Use `list` as the container for the collection of columns. An iterable would be more pythonic, but would lose some type safety. 2. The column collection is agnostic to index/data columns; libcudf does not handle index columns separately either. This helps simplify the Cython logic.
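As a rough sketch of the new convention (distilled from the diffs below; `df` and `gather_map` here are placeholder inputs, not names introduced by this PR):

```
# Cython APIs now take a flat list of Columns and return a list of Columns.
# The caller decides whether the index participates by prepending its
# columns to the list, and rebuilds the result with `_from_columns`.
columns = libcudf.copying.gather(
    list(df._index._columns + df._columns),  # index columns first, if kept
    gather_map,
    nullify=False,
)
result = df.__class__._from_columns(
    columns, df._column_names, df._index.names
)
```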
Gather/Take Benchmark ``` ----------------------------------- benchmark '100-random-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------------------- gather_single_column[100-random-False] (afte) 420.4372 (1.0) 552.7758 (1.0) 428.8227 (1.0) gather_single_column[100-random-False] (befo) 597.7047 (1.42) 811.8181 (1.47) 606.3709 (1.41) take_multiple_column[100-random-False] (afte) 849.6591 (2.02) 6,339.7521 (11.47) 870.1292 (2.03) take_multiple_column[100-random-False] (befo) 864.0001 (2.06) 1,091.5170 (1.97) 872.8270 (2.04) -------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '100-random-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------------------- gather_single_column[100-random-True] (afte) 141.4879 (1.0) 3,144.3723 (2.64) 145.7316 (1.0) gather_single_column[100-random-True] (befo) 291.5259 (2.06) 3,083.7669 (2.59) 299.2343 (2.05) take_multiple_column[100-random-True] (afte) 958.2350 (6.77) 1,295.6643 (1.09) 971.2230 (6.66) take_multiple_column[100-random-True] (befo) 967.4439 (6.84) 1,191.7809 (1.0) 976.4725 (6.70) ------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '100-reverse-False': 4 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------------- gather_single_column[100-reverse-False] (afte) 414.2257 (1.0) 6,856.2678 (2.05) 426.5804 (1.0) gather_single_column[100-reverse-False] (befo) 589.7889 (1.42) 3,387.3413 (1.01) 602.0794 (1.41) take_multiple_column[100-reverse-False] (afte) 849.6824 (2.05) 4,650.7069 (1.39) 862.7702 (2.02) take_multiple_column[100-reverse-False] (befo) 863.7700 (2.09) 3,348.6579 (1.0) 877.5145 (2.06) --------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '100-reverse-True': 4 tests ------------------------------------ Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------------------- gather_single_column[100-reverse-True] (afte) 141.5601 (1.0) 292.0129 (1.0) 144.5997 (1.0) gather_single_column[100-reverse-True] (befo) 286.7738 (2.03) 4,374.5530 (14.98) 297.3910 (2.06) take_multiple_column[100-reverse-True] (afte) 960.0958 (6.78) 1,354.3908 (4.64) 973.7589 (6.73) take_multiple_column[100-reverse-True] (befo) 963.5990 (6.81) 1,175.8050 (4.03) 975.9332 (6.75) -------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '100-sequence-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------------------- gather_single_column[100-sequence-False] (afte) 418.4479 (1.0) 4,602.9259 (2.09) 436.3953 (1.0) gather_single_column[100-sequence-False] (befo) 589.5318 (1.41) 4,665.3422 (2.12) 605.6177 (1.39) take_multiple_column[100-sequence-False] (afte) 851.3979 (2.03) 5,037.6062 (2.29) 866.8329 (1.99) 
take_multiple_column[100-sequence-False] (befo) 858.9821 (2.05) 2,197.5730 (1.0) 872.5517 (2.00) ---------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '100-sequence-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------------- gather_single_column[100-sequence-True] (afte) 145.0991 (1.0) 229.3726 (1.0) 148.7882 (1.0) gather_single_column[100-sequence-True] (befo) 289.9761 (2.00) 363.9143 (1.59) 295.9855 (1.99) take_multiple_column[100-sequence-True] (afte) 961.4970 (6.63) 1,028.0283 (4.48) 969.3146 (6.51) take_multiple_column[100-sequence-True] (befo) 962.7347 (6.64) 1,048.2450 (4.57) 973.8807 (6.55) --------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-random-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------------------- gather_single_column[10000-random-False] (afte) 419.3909 (1.0) 669.2931 (1.0) 427.0140 (1.0) gather_single_column[10000-random-False] (befo) 600.0311 (1.43) 2,198.0200 (3.28) 610.3418 (1.43) take_multiple_column[10000-random-False] (afte) 862.4257 (2.06) 4,764.4433 (7.12) 880.1974 (2.06) take_multiple_column[10000-random-False] (befo) 873.0851 (2.08) 1,024.1494 (1.53) 881.4482 (2.06) ---------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '10000-random-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------------- gather_single_column[10000-random-True] (afte) 134.2846 (1.0) 4,995.3298 (12.11) 139.0623 (1.0) gather_single_column[10000-random-True] (befo) 284.2899 (2.12) 412.4213 (1.0) 289.8005 (2.08) take_multiple_column[10000-random-True] (afte) 960.2159 (7.15) 1,361.8441 (3.30) 973.4057 (7.00) take_multiple_column[10000-random-True] (befo) 965.8998 (7.19) 1,140.6899 (2.77) 976.9224 (7.03) --------------------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '10000-reverse-False': 4 tests ----------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------------------- gather_single_column[10000-reverse-False] (afte) 419.7811 (1.0) 634.7937 (1.0) 428.2997 (1.0) gather_single_column[10000-reverse-False] (befo) 600.3999 (1.43) 762.5762 (1.20) 608.6369 (1.42) take_multiple_column[10000-reverse-False] (afte) 856.1970 (2.04) 1,138.3081 (1.79) 870.1638 (2.03) take_multiple_column[10000-reverse-False] (befo) 869.8748 (2.07) 3,184.0033 (5.02) 889.7182 (2.08) ----------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-reverse-True': 4 tests ------------------------------------ Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------------------- gather_single_column[10000-reverse-True] (afte) 135.4842 (1.0) 
3,634.2950 (7.81) 140.8658 (1.0) gather_single_column[10000-reverse-True] (befo) 284.9372 (2.10) 465.4219 (1.0) 292.6105 (2.08) take_multiple_column[10000-reverse-True] (afte) 957.0192 (7.06) 1,240.3540 (2.67) 966.7779 (6.86) take_multiple_column[10000-reverse-True] (befo) 967.6940 (7.14) 1,062.0849 (2.28) 975.9307 (6.93) ---------------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-sequence-False': 4 tests ------------------------------------ Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------------------------ gather_single_column[10000-sequence-False] (afte) 420.3622 (1.0) 555.1544 (1.0) 427.4441 (1.0) gather_single_column[10000-sequence-False] (befo) 601.7918 (1.43) 3,534.9689 (6.37) 613.6190 (1.44) take_multiple_column[10000-sequence-False] (afte) 858.0340 (2.04) 1,166.5919 (2.10) 868.6121 (2.03) take_multiple_column[10000-sequence-False] (befo) 871.3542 (2.07) 1,118.0961 (2.01) 881.9761 (2.06) ------------------------------------------------------------------------------------------------------------------ ------------------------------------ benchmark '10000-sequence-True': 4 tests ----------------------------------- Name (time in us) Min Max Mean ----------------------------------------------------------------------------------------------------------------- gather_single_column[10000-sequence-True] (afte) 135.8581 (1.0) 3,894.4702 (3.55) 141.3496 (1.0) gather_single_column[10000-sequence-True] (befo) 284.5018 (2.09) 2,703.6560 (2.47) 290.8583 (2.06) take_multiple_column[10000-sequence-True] (afte) 957.4448 (7.05) 1,096.1141 (1.0) 966.4487 (6.84) take_multiple_column[10000-sequence-True] (befo) 966.2341 (7.11) 1,242.0323 (1.13) 978.3753 (6.92) ----------------------------------------------------------------------------------------------------------------- ```
Dropna Benchmark ``` ------------------------------------ benchmark '100-False': 2 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------------- dropna_single_column[100-False] (afte) 143.9294 (1.0) 6,808.9343 (1.58) 150.8468 (1.0) dropna_single_column[100-False] (befo) 306.3441 (2.13) 4,297.9000 (1.0) 315.3899 (2.09) ------------------------------------------------------------------------------------------------------- ---------------------------------- benchmark '100-True': 2 tests ----------------------------------- Name (time in us) Min Max Mean ---------------------------------------------------------------------------------------------------- dropna_single_column[100-True] (afte) 275.7823 (1.0) 327.2779 (1.0) 279.8443 (1.0) dropna_single_column[100-True] (befo) 548.6836 (1.99) 692.2791 (2.12) 557.9867 (1.99) ---------------------------------------------------------------------------------------------------- ------------------------------------ benchmark '10000-False': 2 tests ----------------------------------- Name (time in us) Min Max Mean --------------------------------------------------------------------------------------------------------- dropna_single_column[10000-False] (afte) 164.9209 (1.0) 5,742.9820 (1.61) 170.0143 (1.0) dropna_single_column[10000-False] (befo) 328.6479 (1.99) 3,565.7589 (1.0) 336.6208 (1.98) --------------------------------------------------------------------------------------------------------- ----------------------------------- benchmark '10000-True': 2 tests ------------------------------------ Name (time in us) Min Max Mean -------------------------------------------------------------------------------------------------------- dropna_single_column[10000-True] (afte) 304.6701 (1.0) 441.9931 (1.0) 309.9858 (1.0) dropna_single_column[10000-True] (befo) 571.9690 (1.88) 5,526.0560 (12.50) 586.4943 (1.89) -------------------------------------------------------------------------------------------------------- ```
Unique/Drop_duplicate Benchmark ``` ------------------------------------ benchmark '100': 4 tests ----------------------------------- Name (time in us) Min Max Mean ------------------------------------------------------------------------------------------------- drop_duplicate_df[100] (afte) 891.9560 (2.77) 1,151.0071 (2.76) 904.5752 (2.74) drop_duplicate_df[100] (befo) 880.9832 (2.74) 5,528.1101 (13.23) 896.1535 (2.72) unique_single_column[100] (afte) 322.0579 (1.0) 417.7210 (1.0) 329.5932 (1.0) unique_single_column[100] (befo) 480.7310 (1.49) 4,470.7772 (10.70) 491.7183 (1.49) ------------------------------------------------------------------------------------------------- -------------------------------- benchmark '10000': 4 tests ------------------------------- Name (time in ms) Min Max Mean ------------------------------------------------------------------------------------------- drop_duplicate_df[10000] (afte) 1.0108 (2.23) 3.9981 (4.72) 1.0280 (2.17) drop_duplicate_df[10000] (befo) 1.0021 (2.21) 3.5031 (4.14) 1.0177 (2.15) unique_single_column[10000] (afte) 0.4534 (1.0) 4.5188 (5.33) 0.4740 (1.0) unique_single_column[10000] (befo) 0.6095 (1.34) 0.8471 (1.0) 0.6332 (1.34) ------------------------------------------------------------------------------------------- ```
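At the column level, the new functions can now be called directly, skipping the `Frame` roundtrip entirely. A minimal sketch of this internal usage (assuming the private `Series._column` accessor; see `Column.dropna` and `Column.unique` in the diff below):

```
import cudf
from cudf._lib.stream_compaction import drop_duplicates, drop_nulls

col = cudf.Series([1, None, 2, 2])._column
without_nulls = drop_nulls([col])[0]                 # null rows removed
distinct = drop_duplicates([col], keep="first")[0]   # as in Column.unique
```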
Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/9558 --- python/cudf/cudf/_lib/copying.pyx | 39 ++--- python/cudf/cudf/_lib/stream_compaction.pyx | 81 +++------- python/cudf/cudf/_lib/utils.pxd | 1 + python/cudf/cudf/_lib/utils.pyx | 34 +++- python/cudf/cudf/core/column/column.py | 56 ++++--- python/cudf/cudf/core/frame.py | 162 ++++++++++++-------- python/cudf/cudf/core/index.py | 29 +--- python/cudf/cudf/core/indexed_frame.py | 113 +++++++++++++- python/cudf/cudf/core/multiindex.py | 8 +- python/cudf/cudf/utils/utils.py | 18 +++ 10 files changed, 322 insertions(+), 219 deletions(-) diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 26ef428f21f..28bd78733a3 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -37,7 +37,12 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport size_type -from cudf._lib.utils cimport data_from_table_view, data_from_unique_ptr +from cudf._lib.utils cimport ( + columns_from_unique_ptr, + data_from_table_view, + data_from_unique_ptr, + table_view_from_columns, +) # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar @@ -144,27 +149,12 @@ def copy_range(Column input_column, def gather( - source_table, + columns: list, Column gather_map, - bool keep_index=True, - bool nullify=False, - bool check_bounds=True + bool nullify=False ): - if not pd.api.types.is_integer_dtype(gather_map.dtype): - raise ValueError("Gather map is not integer dtype.") - - if check_bounds and len(gather_map) > 0 and not nullify: - gm_min, gm_max = minmax(gather_map) - if gm_min < -len(source_table) or gm_max >= len(source_table): - raise IndexError(f"Gather map index with min {gm_min}," - f" max {gm_max} is out of bounds in" - f" {type(source_table)} with {len(source_table)}" - f" rows.") - cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table( - source_table, not keep_index - ) + cdef table_view source_table_view = table_view_from_columns(columns) cdef column_view gather_map_view = gather_map.view() cdef cpp_copying.out_of_bounds_policy policy = ( cpp_copying.out_of_bounds_policy.NULLIFY if nullify @@ -180,16 +170,7 @@ def gather( ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if ( - source_table._index is None) - or keep_index is False - else source_table._index_names - ) - ) + return columns_from_unique_ptr(move(c_result)) def scatter(object source, Column scatter_map, Column target_column, diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx index 7167d18409e..ef47e843723 100644 --- a/python/cudf/cudf/_lib/stream_compaction.pyx +++ b/python/cudf/cudf/_lib/stream_compaction.pyx @@ -24,40 +24,34 @@ from cudf._lib.cpp.types cimport ( null_policy, size_type, ) -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport ( + columns_from_unique_ptr, + data_from_unique_ptr, + table_view_from_columns, + table_view_from_table, +) -def drop_nulls(source_table, how="any", keys=None, thresh=None): +def drop_nulls(columns: list, how="any", keys=None, thresh=None): """ Drops 
null rows from cols depending on key columns. Parameters ---------- - source_table : source table whose null rows are dropped to form new table + columns : list of columns how : "any" or "all". If thresh is None, drops rows of cols that have any nulls or all nulls (respectively) in subset (default: "any") - keys : List of Column names. If set, then these columns are checked for - nulls rather than all of cols (optional) + keys : List of column indices. If set, then these columns are checked for + nulls rather than all of columns (optional) thresh : Minimum number of non-nulls required to keep a row (optional) Returns ------- - Frame with null rows dropped + columns with null rows dropped """ - num_index_columns = ( - 0 if source_table._index is None else - source_table._index._num_columns) - # shifting the index number by number of index columns cdef vector[size_type] cpp_keys = ( - [ - num_index_columns + source_table._column_names.index(name) - for name in keys - ] - if keys is not None - else range( - num_index_columns, num_index_columns + source_table._num_columns - ) + keys if keys is not None else range(len(columns)) ) cdef size_type c_keep_threshold = cpp_keys.size() @@ -67,7 +61,7 @@ def drop_nulls(source_table, how="any", keys=None, thresh=None): c_keep_threshold = 1 cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table(source_table) + cdef table_view source_table_view = table_view_from_columns(columns) with nogil: c_result = move( @@ -78,13 +72,7 @@ def drop_nulls(source_table, how="any", keys=None, thresh=None): ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if source_table._index is None - else source_table._index_names) - ) + return columns_from_unique_ptr(move(c_result)) def apply_boolean_mask(source_table, Column boolean_mask): @@ -124,26 +112,29 @@ def apply_boolean_mask(source_table, Column boolean_mask): ) -def drop_duplicates(source_table, +def drop_duplicates(columns: list, object keys=None, object keep='first', - bool nulls_are_equal=True, - bool ignore_index=False): + bool nulls_are_equal=True): """ Drops rows in source_table as per duplicate rows in keys. Parameters ---------- - source_table : source_table whose rows gets dropped - keys : List of Column names belong to source_table + columns : List of columns + keys : List of column indices. If set, then these columns are checked for + duplicates rather than all of columns (optional) keep : keep 'first' or 'last' or none of the duplicate rows nulls_are_equal : if True, nulls are treated equal else not. 
Returns ------- - Frame with duplicate dropped + columns with duplicate dropped """ + cdef vector[size_type] cpp_keys = ( + keys if keys is not None else range(len(columns)) + ) cdef duplicate_keep_option cpp_keep_option if keep == 'first': @@ -155,30 +146,14 @@ def drop_duplicates(source_table, else: raise ValueError('keep must be either "first", "last" or False') - num_index_columns =( - 0 if (source_table._index is None or ignore_index) - else source_table._index._num_columns) # shifting the index number by number of index columns - cdef vector[size_type] cpp_keys = ( - [ - num_index_columns + source_table._column_names.index(name) - for name in keys - ] - if keys is not None - else range( - num_index_columns, num_index_columns + source_table._num_columns - ) - ) - cdef null_equality cpp_nulls_equal = ( null_equality.EQUAL if nulls_are_equal else null_equality.UNEQUAL ) cdef unique_ptr[table] c_result - cdef table_view source_table_view = table_view_from_table( - source_table, ignore_index - ) + cdef table_view source_table_view = table_view_from_columns(columns) with nogil: c_result = move( @@ -190,13 +165,7 @@ def drop_duplicates(source_table, ) ) - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=( - None if (source_table._index is None or ignore_index) - else source_table._index_names) - ) + return columns_from_unique_ptr(move(c_result)) def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False): diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index 10f76279401..50893ef9838 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -16,3 +16,4 @@ cdef data_from_table_view( table_view tv, object owner, object column_names, object index_names=*) cdef table_view table_view_from_columns(columns) except * cdef table_view table_view_from_table(tbl, ignore_index=*) except* +cdef columns_from_unique_ptr(unique_ptr[table] c_tbl) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 18eed2b3396..40edd4bf9a2 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -36,7 +36,6 @@ PARQUET_META_TYPE_MAP = { for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items() } - cdef table_view table_view_from_columns(columns) except*: """Create a cudf::table_view from an iterable of Columns.""" cdef vector[column_view] column_views @@ -221,6 +220,32 @@ def _index_level_name(index_name, level, column_names): return f"__index_level_{level}__" +cdef columns_from_unique_ptr( + unique_ptr[table] c_tbl +): + """Convert a libcudf table into list of columns. + + Parameters + ---------- + c_tbl : unique_ptr[cudf::table] + The libcudf table whose columns will be extracted + + Returns + ------- + list[Column] + A list of columns. + """ + cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) + cdef vector[unique_ptr[column]].iterator it = c_columns.begin() + + cdef size_t i + + columns = [Column.from_unique_ptr(move(dereference(it+i))) + for i in range(c_columns.size())] + + return columns + + cdef data_from_unique_ptr( unique_ptr[table] c_tbl, column_names, index_names=None ): @@ -255,13 +280,8 @@ cdef data_from_unique_ptr( tuple(Dict[str, Column], Optional[Index]) A dict of the columns in the output table. 
""" - cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release()) - cdef vector[unique_ptr[column]].iterator it = c_columns.begin() - - cdef size_t i - columns = [Column.from_unique_ptr(move(dereference(it+i))) - for i in range(c_columns.size())] + columns = columns_from_unique_ptr(move(c_tbl)) # First construct the index, if any index = ( diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e2bedd9d0b1..1d113f6e159 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -34,7 +34,11 @@ create_null_mask, ) from cudf._lib.scalar import as_device_scalar -from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count +from cudf._lib.stream_compaction import ( + distinct_count as cpp_distinct_count, + drop_duplicates, + drop_nulls, +) from cudf._lib.transform import bools_to_mask from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.api.types import ( @@ -71,7 +75,7 @@ pandas_dtypes_alias_to_cudf_alias, pandas_dtypes_to_np_dtypes, ) -from cudf.utils.utils import mask_dtype +from cudf.utils.utils import _gather_map_is_valid, mask_dtype T = TypeVar("T", bound="ColumnBase") @@ -200,11 +204,8 @@ def any(self, skipna: bool = True) -> bool: return result_col def dropna(self, drop_nan: bool = False) -> ColumnBase: - if drop_nan: - col = self.nans_to_nulls() - else: - col = self - return col.as_frame()._drop_na_rows(drop_nan=drop_nan)._as_column() + col = self.nans_to_nulls() if drop_nan else self + return drop_nulls([col])[0] def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -686,28 +687,27 @@ def median(self, skipna: bool = None) -> ScalarLike: raise TypeError(f"cannot perform median with type {self.dtype}") def take( - self: T, - indices: ColumnBase, - keep_index: bool = True, - nullify: bool = False, + self: T, indices: ColumnBase, nullify: bool = False, check_bounds=True ) -> T: - """Return Column by taking values from the corresponding *indices*.""" + """Return Column by taking values from the corresponding *indices*. + + Skip bounds checking if check_bounds is False. + Set rows to null for all out of bound indices if nullify is `True`. + """ # Handle zero size if indices.size == 0: return cast(T, column_empty_like(self, newsize=0)) - try: - return ( - self.as_frame() - ._gather(indices, keep_index=keep_index, nullify=nullify) - ._as_column() - ._with_type_metadata(self.dtype) - ) - except RuntimeError as e: - if "out of bounds" in str(e): - raise IndexError( - f"index out of bounds for column of size {len(self)}" - ) from e - raise + + # TODO: For performance, the check and conversion of gather map should + # be done by the caller. This check will be removed in future release. + if not is_integer_dtype(indices.dtype): + indices = indices.astype("int32") + if not _gather_map_is_valid(indices, len(self), check_bounds, nullify): + raise IndexError("Gather map index is out of bounds.") + + return libcudf.copying.gather([self], indices, nullify=nullify)[ + 0 + ]._with_type_metadata(self.dtype) def isin(self, values: Sequence) -> ColumnBase: """Check whether values are contained in the Column. 
@@ -1098,11 +1098,7 @@ def unique(self) -> ColumnBase: # the following issue resolved: # https://github.com/rapidsai/cudf/issues/5286 - return ( - self.as_frame() - .drop_duplicates(keep="first", ignore_index=True) - ._as_column() - ) + return drop_duplicates([self], keep="first")[0] def serialize(self) -> Tuple[dict, list]: header: Dict[Any, Any] = {} diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 58fe8a43d8d..d7a75cb9f40 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -52,6 +52,7 @@ from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type, is_column_like +from cudf.utils.utils import _gather_map_is_valid T = TypeVar("T", bound="Frame") @@ -140,6 +141,37 @@ def _from_data( Frame.__init__(obj, data, index) return obj + @classmethod + def _from_columns( + cls, + columns: List[ColumnBase], + column_names: List[str], + index_names: Optional[List[str]] = None, + ): + """Construct a `Frame` object from a list of columns. + + If `index_names` is set, the first `len(index_names)` columns are + used to construct the index of the frame. + """ + index = None + n_index_columns = 0 + if index_names is not None: + n_index_columns = len(index_names) + index = cudf.core.index._index_from_data( + dict(zip(range(n_index_columns), columns)) + ) + if isinstance(index, cudf.MultiIndex): + index.names = index_names + else: + index.name = index_names[0] + + data = { + name: columns[i + n_index_columns] + for i, name in enumerate(column_names) + } + + return cls._from_data(data, index) + def _mimic_inplace( self: T, result: Frame, inplace: bool = False ) -> Optional[Frame]: @@ -520,22 +552,32 @@ def _get_columns_by_index(self, indices): def _gather( self, gather_map, keep_index=True, nullify=False, check_bounds=True ): + """Gather rows of frame specified by indices in `gather_map`. + + Skip bounds checking if check_bounds is False. + Set rows to null for all out of bound indices if nullify is `True`. + """ + # TODO: `keep_index` argument is to be removed. + gather_map = cudf.core.column.as_column(gather_map) + + # TODO: For performance, the check and conversion of gather map should + # be done by the caller. This check will be removed in future release. 
if not is_integer_dtype(gather_map.dtype): gather_map = gather_map.astype("int32") - result = self.__class__._from_data( - *libcudf.copying.gather( - self, - as_column(gather_map), - keep_index=keep_index, - nullify=nullify, - check_bounds=check_bounds, - ) + + if not _gather_map_is_valid( + gather_map, len(self), check_bounds, nullify + ): + raise IndexError("Gather map index is out of bounds.") + + result = self.__class__._from_columns( + libcudf.copying.gather( + list(self._columns), gather_map, nullify=nullify, + ), + self._column_names, ) - result._copy_type_metadata(self, include_index=keep_index) - result._data.names = self._data.names - if keep_index and self._index is not None: - result._index.names = self._index.names + result._copy_type_metadata(self) return result def _hash(self, method, initial_hash=None): @@ -1396,10 +1438,8 @@ def _drop_na_rows( diff = set(subset) - set(self._data) if len(diff) != 0: raise KeyError(f"columns {diff} do not exist") - subset_cols = [ - name for name, col in self._data.items() if name in subset - ] - if len(subset_cols) == 0: + + if len(subset) == 0: return self.copy(deep=True) frame = self.copy(deep=False) @@ -1412,16 +1452,19 @@ def _drop_na_rows( else: frame._data[name] = col - result = self.__class__._from_data( - *libcudf.stream_compaction.drop_nulls( - frame, how=how, keys=subset, thresh=thresh - ) + result = self.__class__._from_columns( + libcudf.stream_compaction.drop_nulls( + list(self._index._data.columns + frame._columns), + how=how, + keys=self._positions_from_column_names( + subset, offset_by_index_columns=True + ), + thresh=thresh, + ), + self._column_names, + self._index.names, ) result._copy_type_metadata(frame) - if self._index is not None: - result._index.name = self._index.name - if isinstance(self._index, cudf.MultiIndex): - result._index.names = self._index.names return result def _drop_na_columns(self, how="any", subset=None, thresh=None): @@ -2262,55 +2305,45 @@ def to_arrow(self): ) def drop_duplicates( - self, - subset=None, - keep="first", - nulls_are_equal=True, - ignore_index=False, + self, keep="first", nulls_are_equal=True, ): """ - Drops rows in frame as per duplicate rows in `subset` columns from - self. + Drop duplicate rows in frame. - subset : list, optional - List of columns to consider when dropping rows. - keep : ["first", "last", False] first will keep first of duplicate, - last will keep last of the duplicate and False drop all - duplicate - nulls_are_equal: null elements are considered equal to other null - elements - ignore_index: bool, default False - If True, the resulting axis will be labeled 0, 1, …, n - 1. + keep : ["first", "last", False], default "first" + "first" will keep the first duplicate entry, "last" will keep the + last duplicate entry, and False will drop all duplicates. + nulls_are_equal: bool, default True + Null elements are considered equal to other null elements. 
""" - if subset is None: - subset = self._column_names - elif ( - not np.iterable(subset) - or isinstance(subset, str) - or isinstance(subset, tuple) - and subset in self._data.names - ): - subset = (subset,) - diff = set(subset) - set(self._data) - if len(diff) != 0: - raise KeyError(f"columns {diff} do not exist") - subset_cols = [name for name in self._column_names if name in subset] - if len(subset_cols) == 0: - return self.copy(deep=True) - result = self.__class__._from_data( - *libcudf.stream_compaction.drop_duplicates( - self, - keys=subset, + result = self.__class__._from_columns( + libcudf.stream_compaction.drop_duplicates( + list(self._columns), + keys=range(len(self._columns)), keep=keep, nulls_are_equal=nulls_are_equal, - ignore_index=ignore_index, - ) + ), + self._column_names, ) - + # TODO: _copy_type_metadata is a common pattern to apply after the + # roundtrip from libcudf. We should build this into a factory function + # to increase reusability. result._copy_type_metadata(self) return result + def _positions_from_column_names(self, column_names): + """Map each column name into their positions in the frame. + + The order of indices returned corresponds to the column order in this + Frame. + """ + return [ + i + for i, name in enumerate(self._column_names) + if name in set(column_names) + ] + def replace( self, to_replace=None, @@ -2589,7 +2622,10 @@ def _copy_type_metadata( self._index, cudf.core.index.CategoricalIndex ): self._index = cudf.Index( - cast(cudf.core.index.NumericIndex, self._index)._column + cast( + cudf.core.index.NumericIndex, self._index + )._column, + name=self._index.name, ) return self diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5ea9ac945dc..8f905ee6d49 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -86,6 +86,7 @@ def _lexsorted_equal_range( def _index_from_data(data: MutableMapping, name: Any = None): """Construct an index of the appropriate type from some data.""" + if len(data) == 0: raise ValueError("Cannot construct Index from any empty Table") if len(data) == 1: @@ -770,34 +771,6 @@ def deserialize(cls, header, frames): return super().deserialize(header, frames) - def drop_duplicates(self, keep="first"): - """ - Return Index with duplicate values removed - - Parameters - ---------- - keep : {‘first’, ‘last’, False}, default ‘first’ - * ‘first’ : Drop duplicates except for the - first occurrence. - * ‘last’ : Drop duplicates except for the - last occurrence. - * False : Drop all duplicates. 
- - Returns - ------- - Index - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) - >>> idx - StringIndex(['lama' 'cow' 'lama' 'beetle' 'lama' 'hippo'], dtype='object') - >>> idx.drop_duplicates() - StringIndex(['beetle' 'cow' 'hippo' 'lama'], dtype='object') - """ # noqa: E501 - return super().drop_duplicates(keep=keep) - def _binaryop( self, other: T, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index cf12907d96a..2044bad9675 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -8,17 +8,19 @@ from uuid import uuid4 import cupy as cp +import numpy as np import pandas as pd from nvtx import annotate import cudf +import cudf._lib as libcudf from cudf._typing import ColumnLike -from cudf.api.types import is_categorical_dtype, is_list_like +from cudf.api.types import is_categorical_dtype, is_integer_dtype, is_list_like from cudf.core.column import arange from cudf.core.frame import Frame from cudf.core.index import Index from cudf.core.multiindex import MultiIndex -from cudf.utils.utils import cached_property +from cudf.utils.utils import _gather_map_is_valid, cached_property def _indices_from_labels(obj, labels): @@ -435,6 +437,113 @@ def sort_index( out = out.reset_index(drop=True) return self._mimic_inplace(out, inplace=inplace) + def _gather( + self, gather_map, keep_index=True, nullify=False, check_bounds=True + ): + """Gather rows of frame specified by indices in `gather_map`. + + Skip bounds checking if check_bounds is False. + Set rows to null for all out of bound indices if nullify is `True`. + """ + gather_map = cudf.core.column.as_column(gather_map) + + # TODO: For performance, the check and conversion of gather map should + # be done by the caller. This check will be removed in future release. + if not is_integer_dtype(gather_map.dtype): + gather_map = gather_map.astype("int32") + + if not _gather_map_is_valid( + gather_map, len(self), check_bounds, nullify + ): + raise IndexError("Gather map index is out of bounds.") + + result = self.__class__._from_columns( + libcudf.copying.gather( + list(self._index._columns + self._columns) + if keep_index + else list(self._columns), + gather_map, + nullify=nullify, + ), + self._column_names, + self._index.names if keep_index else None, + ) + + result._copy_type_metadata(self, include_index=keep_index) + return result + + def _positions_from_column_names( + self, column_names, offset_by_index_columns=False + ): + """Map each column name into their positions in the frame. + + Return positions of the provided column names, offset by the number of + index columns `offset_by_index_columns` is True. The order of indices + returned corresponds to the column order in this Frame. + """ + num_index_columns = ( + len(self._index._data) if offset_by_index_columns else 0 + ) + return [ + i + num_index_columns + for i, name in enumerate(self._column_names) + if name in set(column_names) + ] + + def drop_duplicates( + self, + subset=None, + keep="first", + nulls_are_equal=True, + ignore_index=False, + ): + """ + Drop duplicate rows in frame. + + subset : list, optional + List of columns to consider when dropping rows. + keep : ["first", "last", False] + "first" will keep the first duplicate entry, "last" will keep the + last duplicate entry, and False will drop all duplicates. + nulls_are_equal: bool, default True + Null elements are considered equal to other null elements. 
+ ignore_index: bool, default False + If True, the resulting axis will be labeled 0, 1, ..., n - 1. + """ + if subset is None: + subset = self._column_names + elif ( + not np.iterable(subset) + or isinstance(subset, str) + or isinstance(subset, tuple) + and subset in self._data.names + ): + subset = (subset,) + diff = set(subset) - set(self._data) + if len(diff) != 0: + raise KeyError(f"columns {diff} do not exist") + subset_cols = [name for name in self._column_names if name in subset] + if len(subset_cols) == 0: + return self.copy(deep=True) + + keys = self._positions_from_column_names( + subset, offset_by_index_columns=not ignore_index + ) + result = self.__class__._from_columns( + libcudf.stream_compaction.drop_duplicates( + list(self._columns) + if ignore_index + else list(self._index._columns + self._columns), + keys=keys, + keep=keep, + nulls_are_equal=nulls_are_equal, + ), + self._column_names, + self._index.names if not ignore_index else None, + ) + result._copy_type_metadata(self) + return result + def sort_values( self, by, diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a1eda697683..e0c68e56f63 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -137,9 +137,9 @@ def __init__( else: level = cudf.DataFrame({column_name: levels[i]}) - source_data[column_name] = libcudf.copying.gather(level, col)[0][ - column_name - ] + source_data[column_name] = libcudf.copying.gather( + [level._data[column_name]], col + )[0] super().__init__(source_data) self._levels = levels @@ -1409,7 +1409,7 @@ def fillna(self, value): return super().fillna(value=value) def unique(self): - return self.drop_duplicates(ignore_index=True) + return self.drop_duplicates(keep="first") def _clean_nulls_from_index(self): """ diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index a9611a91554..cea384b9c11 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -12,6 +12,7 @@ import rmm import cudf +from cudf._lib.reduce import minmax from cudf.core import column from cudf.core.buffer import Buffer from cudf.utils.dtypes import to_cudf_compatible_scalar @@ -506,3 +507,20 @@ def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: if (indices == cp.arange(start, stop, step)).all(): return slice(start, stop, step) return indices + + +def _gather_map_is_valid( + gather_map: "cudf.core.column.ColumnBase", + nrows: int, + check_bounds: bool, + nullify: bool, +) -> bool: + """Returns true if gather map is valid. + + A gather map is valid if empty or all indices are within the range + ``[-nrows, nrows)``, except when ``nullify`` is specifed. + """ + if not check_bounds or nullify or len(gather_map) == 0: + return True + gm_min, gm_max = minmax(gather_map) + return gm_min >= -nrows and gm_max < nrows From 09a8a4773f74ef6241e9eac4e674181bc753de50 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 19 Nov 2021 16:57:25 -0600 Subject: [PATCH 017/202] Use stop instead of stop_. (#9735) Small fix to inconsistent variable names in tests, following up from #9571. 
Previous conversation: https://github.com/rapidsai/cudf/pull/9571#discussion_r750568195 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9735 --- cpp/tests/datetime/datetime_ops_test.cpp | 11 +++++------ cpp/tests/wrappers/timestamps_test.cu | 15 +++++++-------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 2097e09e674..4ac24317145 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -183,10 +183,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedDatetimeComponents) using namespace cudf::datetime; using namespace cuda::std::chrono; - auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto timestamps = - generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto timestamps = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); auto expected_years = fixed_width_column_wrapper{1890, 1906, 1922, 1938, 1954, 1970, 1985, 2001, 2017, 2033}; @@ -221,9 +220,9 @@ TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedNullableDatetimeComponen using namespace cuda::std::chrono; auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT auto timestamps = - generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); auto expected_years = fixed_width_column_wrapper{ {1890, 1906, 1922, 1938, 1954, 1970, 1985, 2001, 2017, 2033}, diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index b458f34cca8..097b786aefe 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -78,10 +78,9 @@ TYPED_TEST(ChronoColumnTest, ChronoDurationsMatchPrimitiveRepresentation) using namespace cudf::test; using namespace cuda::std::chrono; - auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT - auto stop_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto chrono_col = - generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop_)); + auto start = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT + auto stop = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT + auto chrono_col = generate_timestamps(this->size(), time_point_ms(start), time_point_ms(stop)); // round-trip through the host to copy `chrono_col` values // to a new fixed_width_column_wrapper `primitive_col` @@ -135,14 +134,14 @@ TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode) auto start_lhs = milliseconds(-2500000000000); // Sat, 11 Oct 1890 19:33:20 GMT auto start_rhs = milliseconds(-2400000000000); // Tue, 12 Dec 1893 05:20:00 GMT - auto stop_lhs_ = milliseconds(2500000000000); // Mon, 22 Mar 2049 04:26:40 GMT - auto stop_rhs_ = milliseconds(2600000000000); // Wed, 22 May 2052 14:13:20 GMT + auto stop_lhs = milliseconds(2500000000000); // Mon, 22 Mar 2049 
04:26:40 GMT + auto stop_rhs = milliseconds(2600000000000); // Wed, 22 May 2052 14:13:20 GMT auto chrono_lhs_col = - generate_timestamps(this->size(), time_point_ms(start_lhs), time_point_ms(stop_lhs_)); + generate_timestamps(this->size(), time_point_ms(start_lhs), time_point_ms(stop_lhs)); auto chrono_rhs_col = - generate_timestamps(this->size(), time_point_ms(start_rhs), time_point_ms(stop_rhs_)); + generate_timestamps(this->size(), time_point_ms(start_rhs), time_point_ms(stop_rhs)); rmm::device_uvector indices(this->size(), rmm::cuda_stream_default); thrust::sequence(rmm::exec_policy(), indices.begin(), indices.end()); From f0367c0e1ebec54c964a2114b248926b8f82ec04 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 19 Nov 2021 15:06:25 -0800 Subject: [PATCH 018/202] Use cuFile direct device reads/writes by default in cuIO (#9722) Making this change early in 22.02 to test through internal use + nightly builds before the release. - Modify the way cuFile integration is enabled to match the nvCOMP integration. - Change the default from OFF to GDS (GDS on, only for direct reads/writes, no compatibility mode). - cuFile JSON config file is now modified on first cuFile use (same time as the driver), instead of the first query that checks if GDS use is enabled. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9722 --- cpp/CMakeLists.txt | 1 + cpp/src/io/utilities/config_utils.cpp | 86 ++++++++++++++ cpp/src/io/utilities/config_utils.hpp | 37 +++--- cpp/src/io/utilities/datasource.cpp | 9 +- cpp/src/io/utilities/file_io_utilities.cpp | 110 +++++++++--------- cpp/src/io/utilities/file_io_utilities.hpp | 26 ----- .../cudf/source/basics/io-gds-integration.rst | 11 +- .../source/basics/io-nvcomp-integration.rst | 7 +- 8 files changed, 171 insertions(+), 116 deletions(-) create mode 100644 cpp/src/io/utilities/config_utils.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 966728d7647..7a556d2c0f6 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -310,6 +310,7 @@ add_library( src/io/statistics/parquet_column_statistics.cu src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp + src/io/utilities/config_utils.cpp src/io/utilities/data_sink.cpp src/io/utilities/datasource.cpp src/io/utilities/file_io_utilities.cpp diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp new file mode 100644 index 00000000000..2c1dc1cc0aa --- /dev/null +++ b/cpp/src/io/utilities/config_utils.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "config_utils.hpp" + +#include + +#include +#include + +namespace cudf::io::detail { + +std::string getenv_or(std::string const& env_var_name, std::string_view default_val) +{ + auto const env_val = std::getenv(env_var_name.c_str()); + return std::string{(env_val == nullptr) ? default_val : env_val}; +} + +namespace cufile_integration { + +namespace { +/** + * @brief Defines which cuFile usage to enable. + */ +enum class usage_policy : uint8_t { OFF, GDS, ALWAYS }; + +/** + * @brief Get the current usage policy. + */ +usage_policy get_env_policy() +{ + static auto const env_val = getenv_or("LIBCUDF_CUFILE_POLICY", "GDS"); + if (env_val == "OFF") return usage_policy::OFF; + if (env_val == "GDS") return usage_policy::GDS; + if (env_val == "ALWAYS") return usage_policy::ALWAYS; + CUDF_FAIL("Invalid LIBCUDF_CUFILE_POLICY value: " + env_val); +} +} // namespace + +bool is_always_enabled() { return get_env_policy() == usage_policy::ALWAYS; } + +bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_policy::GDS; } + +} // namespace cufile_integration + +namespace nvcomp_integration { + +namespace { +/** + * @brief Defines which nvCOMP usage to enable. + */ +enum class usage_policy : uint8_t { OFF, STABLE, ALWAYS }; + +/** + * @brief Get the current usage policy. + */ +usage_policy get_env_policy() +{ + static auto const env_val = getenv_or("LIBCUDF_NVCOMP_POLICY", "STABLE"); + if (env_val == "OFF") return usage_policy::OFF; + if (env_val == "STABLE") return usage_policy::STABLE; + if (env_val == "ALWAYS") return usage_policy::ALWAYS; + CUDF_FAIL("Invalid LIBCUDF_NVCOMP_POLICY value: " + env_val); +} +} // namespace + +bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } + +bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_policy::STABLE; } + +} // namespace nvcomp_integration + +} // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/config_utils.hpp index a1d8e747e44..baa45fef08a 100644 --- a/cpp/src/io/utilities/config_utils.hpp +++ b/cpp/src/io/utilities/config_utils.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include namespace cudf::io::detail { @@ -24,44 +23,34 @@ namespace cudf::io::detail { * @brief Returns the value of the environment variable, or a default value if the variable is not * present. */ -inline std::string getenv_or(std::string const& env_var_name, std::string_view default_val) -{ - auto const env_val = std::getenv(env_var_name.c_str()); - return std::string{(env_val == nullptr) ? default_val : env_val}; -} +std::string getenv_or(std::string const& env_var_name, std::string_view default_val); -namespace nvcomp_integration { +namespace cufile_integration { -namespace { /** - * @brief Defines which nvCOMP usage to enable. + * @brief Returns true if cuFile and its compatibility mode are enabled. */ -enum class usage_policy : uint8_t { OFF, STABLE, ALWAYS }; +bool is_always_enabled(); /** - * @brief Get the current usage policy. + * @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled). 
*/ -inline usage_policy get_env_policy() -{ - static auto const env_val = getenv_or("LIBCUDF_NVCOMP_POLICY", "STABLE"); - if (env_val == "OFF") return usage_policy::OFF; - if (env_val == "ALWAYS") return usage_policy::ALWAYS; - return usage_policy::STABLE; -} -} // namespace +bool is_gds_enabled(); + +} // namespace cufile_integration + +namespace nvcomp_integration { /** * @brief Returns true if all nvCOMP uses are enabled. */ -inline bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } +bool is_all_enabled(); /** * @brief Returns true if stable nvCOMP use is enabled. */ -inline bool is_stable_enabled() -{ - return is_all_enabled() or get_env_policy() == usage_policy::STABLE; -} +bool is_stable_enabled(); } // namespace nvcomp_integration + } // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 7afffaede9e..3de6f35cb0d 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -14,15 +14,16 @@ * limitations under the License. */ +#include "file_io_utilities.hpp" + #include +#include +#include #include #include #include -#include -#include "file_io_utilities.hpp" - namespace cudf { namespace io { namespace { @@ -239,7 +240,7 @@ std::unique_ptr datasource::create(const std::string& filepath, size_t size) { #ifdef CUFILE_FOUND - if (detail::cufile_config::instance()->is_required()) { + if (detail::cufile_integration::is_always_enabled()) { // avoid mmap as GDS is expected to be used for most reads return std::make_unique(filepath.c_str()); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 387452e171a..7a48b7d7301 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -51,45 +51,14 @@ file_wrapper::~file_wrapper() { close(fd); } #ifdef CUFILE_FOUND -cufile_config::cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} -{ - if (is_enabled()) { - // Modify the config file based on the policy - auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); - std::ifstream user_config_file(config_file_path); - // Modified config file is stored in a temporary directory - auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; - std::ofstream cudf_config_file(cudf_config_path); - - std::string line; - while (std::getline(user_config_file, line)) { - std::string const tag = "\"allow_compat_mode\""; - if (line.find(tag) != std::string::npos) { - // TODO: only replace the true/false value - // Enable compatiblity mode when cuDF does not fall back to host path - cudf_config_file << tag << ": " << (is_required() ? "true" : "false") << ",\n"; - } else { - cudf_config_file << line << '\n'; - } - - // Point libcufile to the modified config file - CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, - "Failed to set the cuFile config file environment variable."); - } - } -} -cufile_config const* cufile_config::instance() -{ - static cufile_config _instance; - return &_instance; -} - /** * @brief Class that dynamically loads the cuFile library and manages the cuFile driver. 
*/ class cufile_shim { private: cufile_shim(); + void modify_cufile_json() const; + void load_cufile_lib(); void* cf_lib = nullptr; decltype(cuFileDriverOpen)* driver_open = nullptr; @@ -116,25 +85,60 @@ class cufile_shim { decltype(cuFileWrite)* write = nullptr; }; +void cufile_shim::modify_cufile_json() const +{ + std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; + temp_directory tmp_config_dir{"cudf_cufile_config"}; + + // Modify the config file based on the policy + auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json"); + std::ifstream user_config_file(config_file_path); + // Modified config file is stored in a temporary directory + auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json"; + std::ofstream cudf_config_file(cudf_config_path); + + std::string line; + while (std::getline(user_config_file, line)) { + std::string const tag = "\"allow_compat_mode\""; + if (line.find(tag) != std::string::npos) { + // TODO: only replace the true/false value instead of replacing the whole line + // Enable compatibility mode when cuDF does not fall back to host path + cudf_config_file << tag << ": " + << (cufile_integration::is_always_enabled() ? "true" : "false") << ",\n"; + } else { + cudf_config_file << line << '\n'; + } + + // Point libcufile to the modified config file + CUDF_EXPECTS(setenv(json_path_env_var.c_str(), cudf_config_path.c_str(), 0) == 0, + "Failed to set the cuFile config file environment variable."); + } +} + +void cufile_shim::load_cufile_lib() +{ + cf_lib = dlopen("libcufile.so", RTLD_NOW); + driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); + CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile cuFileDriverOpen symbol"); + driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); + CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile cuFileDriverClose symbol"); + handle_register = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); + CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile cuFileHandleRegister symbol"); + handle_deregister = + reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); + CUDF_EXPECTS(handle_deregister != nullptr, "could not find cuFile cuFileHandleDeregister symbol"); + read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); + CUDF_EXPECTS(read != nullptr, "could not find cuFile cuFileRead symbol"); + write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); + CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite symbol"); +} + cufile_shim::cufile_shim() { try { - cf_lib = dlopen("libcufile.so", RTLD_NOW); - driver_open = reinterpret_cast(dlsym(cf_lib, "cuFileDriverOpen")); - CUDF_EXPECTS(driver_open != nullptr, "could not find cuFile cuFileDriverOpen symbol"); - driver_close = reinterpret_cast(dlsym(cf_lib, "cuFileDriverClose")); - CUDF_EXPECTS(driver_close != nullptr, "could not find cuFile cuFileDriverClose symbol"); - handle_register = - reinterpret_cast(dlsym(cf_lib, "cuFileHandleRegister")); - CUDF_EXPECTS(handle_register != nullptr, "could not find cuFile cuFileHandleRegister symbol"); - handle_deregister = - reinterpret_cast(dlsym(cf_lib, "cuFileHandleDeregister")); - CUDF_EXPECTS(handle_deregister != nullptr, - "could not find cuFile cuFileHandleDeregister symbol"); - read = reinterpret_cast(dlsym(cf_lib, "cuFileRead")); - CUDF_EXPECTS(read != nullptr, "could not find cuFile cuFileRead symbol"); - write = reinterpret_cast(dlsym(cf_lib, "cuFileWrite")); - CUDF_EXPECTS(write != nullptr, "could not find cuFile cuFileWrite 
symbol"); + modify_cufile_json(); + load_cufile_lib(); CUDF_EXPECTS(driver_open().err == CU_FILE_SUCCESS, "Failed to initialize cuFile driver"); } catch (cudf::logic_error const& err) { @@ -285,11 +289,11 @@ std::future cufile_output_impl::write_async(void const* data, size_t offse std::unique_ptr make_cufile_input(std::string const& filepath) { #ifdef CUFILE_FOUND - if (cufile_config::instance()->is_enabled()) { + if (cufile_integration::is_gds_enabled()) { try { return std::make_unique(filepath); } catch (...) { - if (cufile_config::instance()->is_required()) throw; + if (cufile_integration::is_always_enabled()) throw; } } #endif @@ -299,11 +303,11 @@ std::unique_ptr make_cufile_input(std::string const& filepath std::unique_ptr make_cufile_output(std::string const& filepath) { #ifdef CUFILE_FOUND - if (cufile_config::instance()->is_enabled()) { + if (cufile_integration::is_gds_enabled()) { try { return std::make_unique(filepath); } catch (...) { - if (cufile_config::instance()->is_required()) throw; + if (cufile_integration::is_always_enabled()) throw; } } #endif diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index 0119f9b7abd..ede0eb6f925 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -162,32 +162,6 @@ class cufile_output : public cufile_io_base { class cufile_shim; -/** - * @brief Class that manages cuFile configuration. - */ -class cufile_config { - std::string const default_policy = "OFF"; - std::string const json_path_env_var = "CUFILE_ENV_PATH_JSON"; - - std::string const policy = default_policy; - temp_directory tmp_config_dir{"cudf_cufile_config"}; - - cufile_config(); - - public: - /** - * @brief Returns true when cuFile use is enabled. - */ - bool is_enabled() const { return policy == "ALWAYS" or policy == "GDS"; } - - /** - * @brief Returns true when cuDF should not fall back to host IO. - */ - bool is_required() const { return policy == "ALWAYS"; } - - static cufile_config const* instance(); -}; - /** * @brief Class that provides RAII for cuFile file registration. */ diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst index 29cbc2024fc..20f3ec87ccb 100644 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ b/docs/cudf/source/basics/io-gds-integration.rst @@ -5,17 +5,18 @@ Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations. GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. The SDK is available for download `here `_. +GDS is also included in CUDA Toolkit 11.4 and higher. -Use of GPUDirect Storage in cuDF is disabled by default, and can be enabled through environment variable ``LIBCUDF_CUFILE_POLICY``. +Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. This variable also controls the GDS compatibility mode. -There are three special values for the environment variable: +There are three valid values for the environment variable: - "GDS": Enable GDS use; GDS compatibility mode is *off*. - "ALWAYS": Enable GDS use; GDS compatibility mode is *on*. -- "OFF": Compretely disable GDS use. +- "OFF": Completely disable GDS use. 
-Any other value (or no value set) will keep the GDS disabled for use in cuDF and IO will be done using cuDF's CPU bounce buffers. +If no value is set, behavior will be the same as the "GDS" option. This environment variable also affects how cuDF treats GDS errors. When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. @@ -30,5 +31,3 @@ Operations that support the use of GPUDirect Storage: - `to_csv` - `to_parquet` - `to_orc` - -NOTE: current GDS integration is not fully optimized and enabling GDS will not lead to performance improvements in all cases. diff --git a/docs/cudf/source/basics/io-nvcomp-integration.rst b/docs/cudf/source/basics/io-nvcomp-integration.rst index af89ab5285f..521833e2afd 100644 --- a/docs/cudf/source/basics/io-nvcomp-integration.rst +++ b/docs/cudf/source/basics/io-nvcomp-integration.rst @@ -3,15 +3,16 @@ nvCOMP Integration Some types of compression/decompression can be performed using either `nvCOMP library `_ or the internal implementation. -Which implementation is used by default depends on the data format and the compression type. Behavior can be influenced through environment variable ``LIBCUDF_NVCOMP_POLICY``. +Which implementation is used by default depends on the data format and the compression type. +Behavior can be influenced through environment variable ``LIBCUDF_NVCOMP_POLICY``. -There are three special values for the environment variable: +There are three valid values for the environment variable: - "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use. - "ALWAYS": Enable all available uses of nvCOMP, including new, experimental combinations. - "OFF": Disable nvCOMP use whenever possible and use the internal implementations instead. -Any other value (or no value set) will result in the same behavior as the "STABLE" option. +If no value is set, behavior will be the same as the "STABLE" option. .. table:: Current policy for nvCOMP use for different types From 65af9a301acd19784fe7d2d03702be827ce97661 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 19 Nov 2021 16:02:13 -0800 Subject: [PATCH 019/202] Improve cmake format script (#9723) This PR ports some improvements from rapidsai/rmm#913. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/9723 --- .pre-commit-config.yaml | 4 ++-- cpp/scripts/run-cmake-format.sh | 32 +++++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e993f548e1d..1e1ad94ab0b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -72,7 +72,7 @@ repos: args: ['-fallback-style=none'] - id: cmake-format name: cmake-format - entry: bash cpp/scripts/run-cmake-format.sh cmake-format + entry: ./cpp/scripts/run-cmake-format.sh cmake-format language: python types: [cmake] # Note that pre-commit autoupdate does not update the versions @@ -81,7 +81,7 @@ repos: - cmake-format==0.6.11 - id: cmake-lint name: cmake-lint - entry: bash cpp/scripts/run-cmake-format.sh cmake-lint + entry: ./cpp/scripts/run-cmake-format.sh cmake-lint language: python types: [cmake] # Note that pre-commit autoupdate does not update the versions diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh index 76de008b14a..9c981c6cdaa 100755 --- a/cpp/scripts/run-cmake-format.sh +++ b/cpp/scripts/run-cmake-format.sh @@ -1,6 +1,6 @@ #!/bin/bash -# This script is a pre-commit hook that wraps cmakelang's cmake linters. The +# This script is a wrapper for cmakelang that may be used with pre-commit. The # wrapping is necessary because RAPIDS libraries split configuration for # cmakelang linters between a local config file and a second config file that's # shared across all of RAPIDS via rapids-cmake. In order to keep it up to date @@ -16,19 +16,33 @@ # config file at a nonstandard location, they may do so by setting the # environment variable RAPIDS_CMAKE_FORMAT_FILE. # -# While this script can be invoked directly (but only from the repo root since -# all paths are relative to that), it is advisable to instead use the -# pre-commit hooks via -# `pre-commit run (cmake-format)|(cmake-format)`. +# This script can be invoked directly anywhere within the project repository. +# Alternatively, it may be invoked as a pre-commit hook via +# `pre-commit run (cmake-format)|(cmake-lint)`. # # Usage: # bash run-cmake-format.sh {cmake-format,cmake-lint} infile [infile ...] -# Note that pre-commit always runs from the root of the repository, so relative -# paths are automatically relative to the repo root. +status=0 +if [ -z ${CUDF_ROOT:+PLACEHOLDER} ]; then + CUDF_BUILD_DIR=$(git rev-parse --show-toplevel 2>&1)/cpp/build + status=$? +else + CUDF_BUILD_DIR=${CUDF_ROOT} +fi + +if ! [ ${status} -eq 0 ]; then + if [[ ${CUDF_BUILD_DIR} == *"not a git repository"* ]]; then + echo "This script must be run inside the cudf repository, or the CUDF_ROOT environment variable must be set." 
+ else + echo "Script failed with unknown error attempting to determine project root:" + echo ${CUDF_BUILD_DIR} + fi + exit 1 +fi + DEFAULT_FORMAT_FILE_LOCATIONS=( - "cpp/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" - "${CUDF_ROOT:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" + "${CUDF_BUILD_DIR:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" "cpp/libcudf_kafka/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" ) From 43a13c6aac76a2a5a42674b4e3e05dbb65ddb741 Mon Sep 17 00:00:00 2001 From: Peixin Date: Mon, 22 Nov 2021 13:43:19 +0800 Subject: [PATCH 020/202] Skip cufile tests in JNI build script (#9744) Signed-off-by: Peixin Li related to #9722 skip cufile test in JNI build while we have a separate pipeline for GDS testing Authors: - Peixin (https://github.com/pxLi) Approvers: - Tim Liu (https://github.com/NvTimLiu) - Gary Shen (https://github.com/GaryShen2008) URL: https://github.com/rapidsai/cudf/pull/9744 --- java/ci/build-in-docker.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index e596cdae5b3..df4ca853398 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -16,7 +16,7 @@ # limitations under the License. # -set -e +set -ex gcc --version PARALLEL_LEVEL=${PARALLEL_LEVEL:-4} @@ -58,7 +58,7 @@ make -j$PARALLEL_LEVEL make install DESTDIR=$INSTALL_PREFIX ###### Build cudf jar ###### -BUILD_ARG="-Dmaven.repo.local=\"$WORKSPACE/.m2\" -DskipTests=$SKIP_JAVA_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL -DUSE_GDS=$ENABLE_GDS" +BUILD_ARG="-Dmaven.repo.local=\"$WORKSPACE/.m2\" -DskipTests=$SKIP_JAVA_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL -DUSE_GDS=$ENABLE_GDS -Dtest=*,!CuFileTest" if [ "$SIGN_FILE" == true ]; then # Build javadoc and sources only when SIGN_FILE is true BUILD_ARG="$BUILD_ARG -Prelease" From 7fa15db306631c026642942993283bd93da1c7c2 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 22 Nov 2021 09:33:47 -0500 Subject: [PATCH 021/202] Fix doxygen for enum types in libcudf (#9724) Fix some doxygen formatting errors around enum types found when looking at various pages in the published docs: https://docs.rapids.ai/api/libcudf/stable/namespacecudf.html Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9724 --- .../cudf/ast/detail/expression_parser.hpp | 6 +-- cpp/include/cudf/ast/expressions.hpp | 6 +-- cpp/include/cudf/copying.hpp | 8 ++-- cpp/include/cudf/detail/gather.cuh | 6 +-- cpp/include/cudf/detail/structs/utilities.hpp | 4 +- cpp/include/cudf/io/types.hpp | 6 +-- .../cudf/strings/char_types/char_types.hpp | 20 ++++---- cpp/include/cudf/strings/regex/flags.hpp | 6 +-- cpp/include/cudf/strings/strip.hpp | 6 +-- cpp/include/cudf/strings/translate.hpp | 5 +- cpp/include/cudf/unary.hpp | 46 +++++++++---------- 11 files changed, 61 insertions(+), 58 deletions(-) diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index dc800bde527..4f73cb1ef6e 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -37,9 +37,9 @@ namespace detail { * linearization process but cannot be explicitly created by the user. 
*/ enum class device_data_reference_type { - COLUMN, // A value in a table column - LITERAL, // A literal value - INTERMEDIATE // An internal temporary value + COLUMN, ///< A value in a table column + LITERAL, ///< A literal value + INTERMEDIATE ///< An internal temporary value }; /** diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 7ae40a7d65f..20aaa42fb68 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -122,9 +122,9 @@ enum class ast_operator { * This determines which table to use in cases with two tables (e.g. joins). */ enum class table_reference { - LEFT, // Column index in the left table - RIGHT, // Column index in the right table - OUTPUT // Column index in the output table + LEFT, ///< Column index in the left table + RIGHT, ///< Column index in the right table + OUTPUT ///< Column index in the output table }; /** diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index ba5043fb261..81dddbd284a 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -41,8 +41,8 @@ namespace cudf { */ enum class out_of_bounds_policy : bool { - NULLIFY, /// Output values corresponding to out-of-bounds indices are null - DONT_CHECK /// No bounds checking is performed, better performance + NULLIFY, ///< Output values corresponding to out-of-bounds indices are null + DONT_CHECK ///< No bounds checking is performed, better performance }; /** @@ -901,8 +901,8 @@ std::unique_ptr get_element( * @brief Indicates whether a row can be sampled more than once. */ enum class sample_with_replacement : bool { - FALSE, // A row can be sampled only once - TRUE // A row can be sampled more than once + FALSE, ///< A row can be sampled only once + TRUE ///< A row can be sampled more than once }; /** diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 594191d275d..08dbdb6f1a0 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -66,9 +66,9 @@ struct bounds_checker { * @brief The operation to perform when a gather map index is out of bounds */ enum class gather_bitmask_op { - DONT_CHECK, // Don't check for out of bounds indices - PASSTHROUGH, // Preserve mask at rows with out of bounds indices - NULLIFY, // Nullify rows with out of bounds indices + DONT_CHECK, ///< Don't check for out of bounds indices + PASSTHROUGH, ///< Preserve mask at rows with out of bounds indices + NULLIFY, ///< Nullify rows with out of bounds indices }; template diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index aece79107c6..6f32e3190bf 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -28,8 +28,8 @@ namespace structs { namespace detail { enum class column_nullability { - MATCH_INCOMING, // generate a null column if the incoming column has nulls - FORCE // always generate a null column + MATCH_INCOMING, ///< generate a null column if the incoming column has nulls + FORCE ///< always generate a null column }; /** diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index ac965e2d416..cf6be8a20af 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -87,9 +87,9 @@ enum class quote_style { * @brief Column statistics granularity type for parquet/orc writers */ enum statistics_freq { - STATISTICS_NONE = 0, //!< No column statistics - STATISTICS_ROWGROUP = 1, //!< 
Per-Rowgroup column statistics - STATISTICS_PAGE = 2, //!< Per-page column statistics + STATISTICS_NONE = 0, ///< No column statistics + STATISTICS_ROWGROUP = 1, ///< Per-Rowgroup column statistics + STATISTICS_PAGE = 2, ///< Per-page column statistics }; /** diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp index 2af79de0716..04d65065bd3 100644 --- a/cpp/include/cudf/strings/char_types/char_types.hpp +++ b/cpp/include/cudf/strings/char_types/char_types.hpp @@ -37,16 +37,16 @@ namespace strings { * does not match to any explicitly named enumerator. */ enum string_character_types : uint32_t { - DECIMAL = 1 << 0, /// all decimal characters - NUMERIC = 1 << 1, /// all numeric characters - DIGIT = 1 << 2, /// all digit characters - ALPHA = 1 << 3, /// all alphabetic characters - SPACE = 1 << 4, /// all space characters - UPPER = 1 << 5, /// all upper case characters - LOWER = 1 << 6, /// all lower case characters - ALPHANUM = DECIMAL | NUMERIC | DIGIT | ALPHA, /// all alphanumeric characters - CASE_TYPES = UPPER | LOWER, /// all case-able characters - ALL_TYPES = ALPHANUM | CASE_TYPES | SPACE /// all character types + DECIMAL = 1 << 0, ///< all decimal characters + NUMERIC = 1 << 1, ///< all numeric characters + DIGIT = 1 << 2, ///< all digit characters + ALPHA = 1 << 3, ///< all alphabetic characters + SPACE = 1 << 4, ///< all space characters + UPPER = 1 << 5, ///< all upper case characters + LOWER = 1 << 6, ///< all lower case characters + ALPHANUM = DECIMAL | NUMERIC | DIGIT | ALPHA, ///< all alphanumeric characters + CASE_TYPES = UPPER | LOWER, ///< all case-able characters + ALL_TYPES = ALPHANUM | CASE_TYPES | SPACE ///< all character types }; /** diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index f6aee6d22cc..637b3b0851b 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -33,9 +33,9 @@ namespace strings { * and to match the Python flag values. */ enum regex_flags : uint32_t { - DEFAULT = 0, /// default - MULTILINE = 8, /// the '^' and '$' honor new-line characters - DOTALL = 16 /// the '.' matching includes new-line characters + DEFAULT = 0, ///< default + MULTILINE = 8, ///< the '^' and '$' honor new-line characters + DOTALL = 16 ///< the '.' matching includes new-line characters }; /** diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index 72863bdf23b..fe9cd41e780 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -31,9 +31,9 @@ namespace strings { * @brief Direction identifier for strip() function. 
*/ enum class strip_type { - LEFT, //<< strip characters from the beginning of the string - RIGHT, //<< strip characters from the end of the string - BOTH //<< strip characters from the beginning and end of the string + LEFT, ///< strip characters from the beginning of the string + RIGHT, ///< strip characters from the end of the string + BOTH ///< strip characters from the beginning and end of the string }; /** diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp index e014f88c451..0cbf6b22029 100644 --- a/cpp/include/cudf/strings/translate.hpp +++ b/cpp/include/cudf/strings/translate.hpp @@ -60,7 +60,10 @@ std::unique_ptr translate( /** * @brief Removes or keeps the specified character ranges in cudf::strings::filter_characters */ -enum class filter_type : bool { KEEP, REMOVE }; +enum class filter_type : bool { + KEEP, ///< All characters but those specified are removed + REMOVE ///< Only the specified characters are removed +}; /** * @brief Removes ranges of characters from each string in a strings column. diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp index 254a7988e2e..36f08b7f23e 100644 --- a/cpp/include/cudf/unary.hpp +++ b/cpp/include/cudf/unary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,28 +28,28 @@ namespace cudf { */ enum class unary_operator : int32_t { - SIN, // < Trigonometric sine - COS, // < Trigonometric cosine - TAN, // < Trigonometric tangent - ARCSIN, // < Trigonometric sine inverse - ARCCOS, // < Trigonometric cosine inverse - ARCTAN, // < Trigonometric tangent inverse - SINH, // < Hyperbolic sine - COSH, // < Hyperbolic cosine - TANH, // < Hyperbolic tangent - ARCSINH, // < Hyperbolic sine inverse - ARCCOSH, // < Hyperbolic cosine inverse - ARCTANH, // < Hyperbolic tangent inverse - EXP, // < Exponential (base e, Euler number) - LOG, // < Natural Logarithm (base e) - SQRT, // < Square-root (x^0.5) - CBRT, // < Cube-root (x^(1.0/3)) - CEIL, // < Smallest integer value not less than arg - FLOOR, // < largest integer value not greater than arg - ABS, // < Absolute value - RINT, // < Rounds the floating-point argument arg to an integer value - BIT_INVERT, // < Bitwise Not (~) - NOT, // < Logical Not (!) + SIN, ///< Trigonometric sine + COS, ///< Trigonometric cosine + TAN, ///< Trigonometric tangent + ARCSIN, ///< Trigonometric sine inverse + ARCCOS, ///< Trigonometric cosine inverse + ARCTAN, ///< Trigonometric tangent inverse + SINH, ///< Hyperbolic sine + COSH, ///< Hyperbolic cosine + TANH, ///< Hyperbolic tangent + ARCSINH, ///< Hyperbolic sine inverse + ARCCOSH, ///< Hyperbolic cosine inverse + ARCTANH, ///< Hyperbolic tangent inverse + EXP, ///< Exponential (base e, Euler number) + LOG, ///< Natural Logarithm (base e) + SQRT, ///< Square-root (x^0.5) + CBRT, ///< Cube-root (x^(1.0/3)) + CEIL, ///< Smallest integer value not less than arg + FLOOR, ///< largest integer value not greater than arg + ABS, ///< Absolute value + RINT, ///< Rounds the floating-point argument arg to an integer value + BIT_INVERT, ///< Bitwise Not (~) + NOT, ///< Logical Not (!) 
}; /** From cac53c5b7f4845faea935b29a6efb323eff56a19 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Mon, 22 Nov 2021 10:42:59 -0800 Subject: [PATCH 022/202] Enable string to decimal 128 cast (#9742) A short PR to enable String to Decimal 128 cast Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9742 --- java/src/main/native/src/ColumnViewJni.cpp | 1 + .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index bce330ea4a3..4efac307627 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -936,6 +936,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas break; case cudf::type_id::DECIMAL32: case cudf::type_id::DECIMAL64: + case cudf::type_id::DECIMAL128: result = cudf::strings::to_fixed_point(*column, n_data_type); break; default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index f332661dc19..a582541a0d4 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3370,6 +3370,22 @@ void testFixedWidthCast() { } } + @Test + void testCastStringToBigDecimal() { + String[] bigValues = {"923121331938210123.321", + "9223372036854775808.191", + "9328323982309091029831.002" + }; + + try (ColumnVector cv = ColumnVector.fromStrings(bigValues); + ColumnVector values = cv.castTo(DType.create(DType.DTypeEnum.DECIMAL128, -3)); + ColumnVector expected = ColumnVector.fromDecimals(new BigDecimal("923121331938210123.321"), + new BigDecimal("9223372036854775808.191"), + new BigDecimal("9328323982309091029831.002"))) { + assertColumnsAreEqual(expected, values); + } + } + @Test void testCastByteToString() { From ebeb2023ce81f254aaa638c0cd308da98b15418d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 22 Nov 2021 14:23:13 -0500 Subject: [PATCH 023/202] Fix out-of-bounds memory write in decimal128-to-string conversion (#9740) This fixes an error found in a memcheck test referenced here: https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=cuda115,LINUX_VER=centos7,PYTHON=3.8/5082/ This also disables the `FixedPointStringConversionOperator` which fails on a Debug build and may be a bug in `std::string`. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9740 --- cpp/src/strings/convert/utilities.cuh | 7 ++++--- cpp/tests/strings/fixed_point_tests.cpp | 4 ++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/strings/convert/utilities.cuh b/cpp/src/strings/convert/utilities.cuh index 234ecf48f2e..d9ca8159706 100644 --- a/cpp/src/strings/convert/utilities.cuh +++ b/cpp/src/strings/convert/utilities.cuh @@ -67,8 +67,9 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) bool const is_negative = cuda::std::is_signed() ? 
(value < 0) : false; constexpr IntegerType base = 10; - constexpr int MAX_DIGITS = 20; // largest 64-bit integer is 20 digits - char digits[MAX_DIGITS]; // place-holder for digit chars + // largest 64-bit integer is 20 digits; largest 128-bit integer is 39 digits + constexpr int MAX_DIGITS = cuda::std::numeric_limits::digits10 + 1; + char digits[MAX_DIGITS]; // place-holder for digit chars int digits_idx = 0; while (value != 0) { assert(digits_idx < MAX_DIGITS); @@ -107,7 +108,7 @@ constexpr size_type count_digits(IntegerType value) auto const digits = [value] { // largest 8-byte unsigned value is 18446744073709551615 (20 digits) // largest 16-byte unsigned value is 340282366920938463463374607431768211455 (39 digits) - auto constexpr max_digits = std::is_same_v ? 39 : 20; + auto constexpr max_digits = cuda::std::numeric_limits::digits10 + 1; size_type digits = 1; __int128_t pow10 = 10; diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 7c188d39f6f..ce4280e0733 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -303,7 +303,11 @@ TEST_F(StringsConvertTest, IsFixedPoint) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected64_scaled); } +#ifdef NDEBUG TEST_F(StringsConvertTest, FixedPointStringConversionOperator) +#else +TEST_F(StringsConvertTest, DISABLED_FixedPointStringConversionOperator) +#endif { auto const max = cuda::std::numeric_limits<__int128_t>::max(); From d1811b5baf1d83f8d376a4f6e7fd84020a24506b Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 24 Nov 2021 01:22:13 +0530 Subject: [PATCH 024/202] update cuda version in local build (#9736) update cuda, ubuntu, python versions in local build using gpuci docker image. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/9736 --- ci/local/README.md | 6 +++--- ci/local/build.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/local/README.md b/ci/local/README.md index 96002802263..7754bcaf647 100644 --- a/ci/local/README.md +++ b/ci/local/README.md @@ -18,12 +18,12 @@ Build and test your local repository using a base gpuCI Docker image where: -H Show this help text -r Path to repository (defaults to working directory) - -i Use Docker image (default is gpuci/rapidsai:${NIGHTLY_VERSION}-cuda10.1-devel-ubuntu16.04-py3.7) + -i Use Docker image (default is gpuci/rapidsai:${NIGHTLY_VERSION}-cuda11.5-devel-ubuntu20.04-py3.8) -s Skip building and testing and start an interactive shell in a container of the Docker image ``` Example Usage: -`bash build.sh -r ~/rapids/cudf -i gpuci/rapidsai:0.16-cuda10.2-devel-ubuntu16.04-py3.7` +`bash build.sh -r ~/rapids/cudf -i gpuci/rapidsai:22.02-cuda11.5-devel-ubuntu20.04-py3.8` For a full list of available gpuCI docker images, visit our [DockerHub](https://hub.docker.com/r/gpuci/rapidsai/tags) page. @@ -42,7 +42,7 @@ There are some caveats to be aware of when using this script, especially if you ### Docker Image Build Repository -The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cudf/build_rapidsai_cuda10.1-ubuntu16.04-py3.7/`. Feel free to remove this directory after the script is finished. 
+The docker image will generate build artifacts in a folder on your machine located in the `root` directory of the repository you passed to the script. For the above example, the directory is named `~/rapids/cudf/build_rapidsai_cuda11.5-ubuntu20.04-py3.8/`. Feel free to remove this directory after the script is finished. *Note*: The script *will not* override your local build repository. Your local environment stays in tact. diff --git a/ci/local/build.sh b/ci/local/build.sh index 1bfb8b63fef..345db967264 100755 --- a/ci/local/build.sh +++ b/ci/local/build.sh @@ -3,7 +3,7 @@ GIT_DESCRIBE_TAG=`git describe --tags` MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` -DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda11.0-devel-ubuntu18.04-py3.7" +DOCKER_IMAGE="gpuci/rapidsai:${MINOR_VERSION}-cuda11.5-devel-ubuntu20.04-py3.8" REPO_PATH=${PWD} RAPIDS_DIR_IN_CONTAINER="/rapids" CPP_BUILD_DIR="cpp/build" From 0fa0cc48a6b3b93e79f918d419a012b75765561c Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 30 Nov 2021 07:50:12 -0700 Subject: [PATCH 025/202] Support `min` and `max` in inclusive scan for structs (#9725) This PR continues to address https://github.com/rapidsai/cudf/issues/8974, adding support for structs in `min` and `max` inclusive scan. Exclusive scan support is not needed in the near future. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - David Wendt (https://github.com/davidwendt) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/9725 --- cpp/include/cudf/detail/scan.hpp | 71 ++++---- cpp/src/groupby/sort/group_scan_util.cuh | 6 + cpp/src/reductions/scan/scan_inclusive.cu | 87 +++++++++- cpp/tests/reductions/reduction_tests.cpp | 2 +- cpp/tests/reductions/scan_tests.cpp | 196 ++++++++++++++++++++++ 5 files changed, 325 insertions(+), 37 deletions(-) diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index 113c15f19a1..8e3db1c7b10 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -26,22 +26,25 @@ namespace detail { /** * @brief Computes the exclusive scan of a column. * - * The null values are skipped for the operation, and if an input element - * at `i` is null, then the output element at `i` will also be null. + * The null values are skipped for the operation, and if an input element at `i` is null, then the + * output element at `i` will also be null. * - * The identity value for the column type as per the aggregation type - * is used for the value of the first element in the output column. + * The identity value for the column type as per the aggregation type is used for the value of the + * first element in the output column. * - * @throws cudf::logic_error if column data_type is not an arithmetic type. + * Struct columns are allowed with aggregation types Min and Max. * - * @param input The input column view for the scan - * @param agg unique_ptr to aggregation operator applied by the scan - * @param null_handling Exclude null values when computing the result if - * null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE. - * Any operation with a null results in a null. + * @throws cudf::logic_error if column data_type is not an arithmetic type or struct type but the + * `agg` is not Min or Max. + * + * @param input The input column view for the scan. + * @param agg unique_ptr to aggregation operator applied by the scan. + * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE. 
+ * Include nulls if null_policy::INCLUDE. Any operation with a null results in + * a null. * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned scalar's device memory - * @returns Column with scan results + * @param mr Device memory resource used to allocate the returned scalar's device memory. + * @returns Column with scan results. */ std::unique_ptr scan_exclusive(column_view const& input, std::unique_ptr const& agg, @@ -52,22 +55,22 @@ std::unique_ptr scan_exclusive(column_view const& input, /** * @brief Computes the inclusive scan of a column. * - * The null values are skipped for the operation, and if an input element - * at `i` is null, then the output element at `i` will also be null. + * The null values are skipped for the operation, and if an input element at `i` is null, then the + * output element at `i` will also be null. * - * String columns are allowed with aggregation types Min and Max. + * String and struct columns are allowed with aggregation types Min and Max. * - * @throws cudf::logic_error if column data_type is not an arithmetic type - * or string type but the `agg` is not Min or Max + * @throws cudf::logic_error if column data_type is not an arithmetic type or string/struct types + * but the `agg` is not Min or Max. * - * @param input The input column view for the scan - * @param agg unique_ptr to aggregation operator applied by the scan - * @param null_handling Exclude null values when computing the result if - * null_policy::EXCLUDE. Include nulls if null_policy::INCLUDE. - * Any operation with a null results in a null. + * @param input The input column view for the scan. + * @param agg unique_ptr to aggregation operator applied by the scan. + * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE. + * Include nulls if null_policy::INCLUDE. Any operation with a null results in + * a null. * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned scalar's device memory - * @returns Column with scan results + * @param mr Device memory resource used to allocate the returned scalar's device memory. + * @returns Column with scan results. */ std::unique_ptr scan_inclusive(column_view const& input, std::unique_ptr const& agg, @@ -76,24 +79,24 @@ std::unique_ptr scan_inclusive(column_view const& input, rmm::mr::device_memory_resource* mr); /** - * @brief Generate row ranks for a column + * @brief Generate row ranks for a column. * - * @param order_by Input column to generate ranks for - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return rank values + * @param order_by Input column to generate ranks for. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return rank values. */ std::unique_ptr inclusive_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** - * @brief Generate row dense ranks for a column + * @brief Generate row dense ranks for a column. 
* - * @param order_by Input column to generate ranks for - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned column's device memory - * @return rank values + * @param order_by Input column to generate ranks for. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return rank values. */ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, rmm::cuda_stream_view stream, diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index b565e8dc6d8..ae3e3232e06 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -239,7 +239,13 @@ struct group_scan_functor()}, gather_map.size(), gather_map.data()); + // // Gather the children elements of the prefix min/max struct elements first. + // + // Typically, we should use `get_sliced_child` for each child column to properly handle the + // input if it is a sliced view. However, since the input to this function is just generated + // from groupby internal APIs which is never a sliced view, we just use `child_begin` and + // `child_end` iterators for simplicity. auto scanned_children = cudf::detail::gather( table_view(std::vector{values.child_begin(), values.child_end()}), diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index 02ecd6df4d9..70f5ca90539 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -14,13 +14,17 @@ * limitations under the License. */ -#include "scan.cuh" +#include +#include #include +#include #include #include #include #include +#include +#include #include #include @@ -150,6 +154,72 @@ struct scan_functor { } }; +template +struct scan_functor { + static std::unique_ptr invoke(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // Op is used only to determined if we want to find the min or max element. + auto constexpr is_min_op = std::is_same_v; + + // Build indices of the scan operation results (ARGMIN/ARGMAX). + // When finding ARGMIN, we need to consider nulls as larger than non-null elements, and the + // opposite for ARGMAX. + auto gather_map = rmm::device_uvector(input.size(), stream); + auto const do_scan = [&](auto const& binop) { + thrust::inclusive_scan(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + gather_map.begin(), + binop); + }; + + auto constexpr null_precedence = is_min_op ? cudf::null_order::AFTER : cudf::null_order::BEFORE; + auto const flattened_input = cudf::structs::detail::flatten_nested_columns( + table_view{{input}}, {}, std::vector{null_precedence}); + auto const d_flattened_input_ptr = table_device_view::create(flattened_input, stream); + auto const flattened_null_precedences = + is_min_op ? 
cudf::detail::make_device_uvector_async(flattened_input.null_orders(), stream) + : rmm::device_uvector(0, stream); + + if (input.has_nulls()) { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + do_scan(binop); + } else { + auto const binop = cudf::reduction::detail::row_arg_minmax_fn( + input.size(), *d_flattened_input_ptr, flattened_null_precedences.data(), is_min_op); + do_scan(binop); + } + + // Gather the children columns of the input column. Must use `get_sliced_child` to properly + // handle input in case it is a sliced view. + auto const input_children = [&] { + auto const it = cudf::detail::make_counting_transform_iterator( + 0, [structs_view = structs_column_view{input}, stream](auto const child_idx) { + return structs_view.get_sliced_child(child_idx); + }); + return std::vector(it, it + input.num_children()); + }(); + + // Gather the children elements of the prefix min/max struct elements for the output. + auto scanned_children = cudf::detail::gather(table_view{input_children}, + gather_map, + out_of_bounds_policy::DONT_CHECK, + negative_index_policy::NOT_ALLOWED, + stream, + mr) + ->release(); + + // Don't need to set a null mask because that will be handled at the caller. + return make_structs_column(input.size(), + std::move(scanned_children), + UNKNOWN_NULL_COUNT, + rmm::device_buffer{0, stream, mr}); + } +}; + /** * @brief Dispatcher for running a Scan operation on an input column * @@ -161,7 +231,11 @@ struct scan_dispatcher { template static constexpr bool is_supported() { - return std::is_invocable_v && !cudf::is_dictionary(); + if constexpr (std::is_same_v) { + return std::is_same_v || std::is_same_v; + } else { + return std::is_invocable_v && !cudf::is_dictionary(); + } } public: @@ -209,6 +283,15 @@ std::unique_ptr scan_inclusive( output->set_null_mask(mask_scan(input, scan_type::INCLUSIVE, stream, mr), UNKNOWN_NULL_COUNT); } + // If the input is a structs column, we also need to push down nulls from the parent output column + // into the children columns. 
+ if (input.type().id() == type_id::STRUCT && output->has_nulls()) { + for (size_type idx = 0; idx < output->num_children(); ++idx) { + structs::detail::superimpose_parent_nulls( + output->view().null_mask(), output->null_count(), output->child(idx), stream, mr); + } + } + return output; } } // namespace detail diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 2c9279260e7..d8ee8f9d08d 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -28,7 +29,6 @@ #include #include #include -#include #include diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index d1e983460d5..0892436eb47 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -397,3 +398,198 @@ TYPED_TEST(ScanDurationTest, Sum) EXPECT_THROW(cudf::scan(col, cudf::make_sum_aggregation(), cudf::scan_type::EXCLUSIVE), cudf::logic_error); } + +struct StructScanTest : public cudf::test::BaseFixture { +}; + +TEST_F(StructScanTest, StructScanMinMaxNoNull) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + + auto const input = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = INTS_CW{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return STRUCTS_CW{{child1, child2}}; + }(); + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "año", "año", "aaa", "aaa", "aaa", "aaa", "$1", "$1", "$1"}; + auto child2 = INTS_CW{1, 1, 1, 4, 4, 4, 4, 8, 8, 8}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1"}; + auto child2 = INTS_CW{1, 2, 3, 3, 3, 3, 3, 3, 3, 3}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(StructScanTest, StructScanMinMaxSlicedInput) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + constexpr int32_t dont_care{1}; + + auto const input_original = [] { + auto child1 = STRINGS_CW{"$dont_care", + "$dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "₹dont_care"}; + auto child2 = INTS_CW{dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return STRUCTS_CW{{child1, child2}}; + }(); + + auto const input = cudf::slice(input_original, {2, 12})[0]; + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "año", "año", "aaa", "aaa", "aaa", "aaa", "$1", "$1", "$1"}; + auto child2 = INTS_CW{1, 1, 1, 4, 4, 4, 4, 8, 8, 8}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", "bit", "₹1", 
"₹1", "₹1", "₹1", "₹1", "₹1", "₹1", "₹1"}; + auto child2 = INTS_CW{1, 2, 3, 3, 3, 3, 3, 3, 3, 3}; + return STRUCTS_CW{{child1, child2}}; + }(); + auto const result = cudf::scan(input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} + +TEST_F(StructScanTest, StructScanMinMaxWithNulls) +{ + using INTS_CW = cudf::test::fixed_width_column_wrapper; + using STRINGS_CW = cudf::test::strings_column_wrapper; + using STRUCTS_CW = cudf::test::structs_column_wrapper; + using cudf::test::iterators::nulls_at; + + auto const input = [] { + auto child1 = STRINGS_CW{{"año", + "bit", + "₹1" /*NULL*/, + "aaa" /*NULL*/, + "zit", + "bat", + "aab", + "$1" /*NULL*/, + "€1" /*NULL*/, + "wut"}, + nulls_at({2, 7})}; + auto child2 = INTS_CW{{1, 2, 3 /*NULL*/, 4 /*NULL*/, 5, 6, 7, 8 /*NULL*/, 9 /*NULL*/, 10}, + nulls_at({2, 7})}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + { + auto const expected = [] { + auto child1 = STRINGS_CW{ + "año", "año", "año", "" /*NULL*/, "año", "año", "aab", "aab", "" /*NULL*/, "aab"}; + auto child2 = INTS_CW{1, 1, 1, 0 /*NULL*/, 1, 1, 7, 7, 0 /*NULL*/, 7}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{ + "año", "bit", "bit", "" /*NULL*/, "zit", "zit", "zit", "zit", "" /*NULL*/, "zit"}; + auto child2 = INTS_CW{1, 2, 2, 0 /*NULL*/, 5, 5, 5, 5, 0 /*NULL*/, 5}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 8})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", + "año", + "año", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}; + auto child2 = INTS_CW{1, + 1, + 1, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } + + { + auto const expected = [] { + auto child1 = STRINGS_CW{"año", + "bit", + "bit", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}; + auto child2 = INTS_CW{1, + 2, + 2, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/, + 0 /*NULL*/}; + return STRUCTS_CW{{child1, child2}, nulls_at({3, 4, 5, 6, 7, 8, 9})}; + }(); + + auto const result = cudf::scan( + input, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE, null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); + } +} From dca8a0a0356e90e2b9dfa2a2cedf38d0c90935cb Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 30 Nov 2021 10:40:18 -0600 Subject: [PATCH 026/202] Fix dtype-argument bug in dask_cudf read_csv (#9796) Closes #9719 `dask_cudf.read_csv` currently fails when both `usecols` and `dtype` are specified. This PR is a simple fix. 
In the near future, the `_internal_read_csv` implementation should also be modified to produce a `Blockwise` HLG Layer, but I will leave that for a separate PR. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9796 --- python/dask_cudf/dask_cudf/io/csv.py | 19 +++++++++++-------- .../dask_cudf/dask_cudf/io/tests/test_csv.py | 5 +++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index 132201a349e..ebb02e3b6d4 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -110,9 +110,17 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs): if chunksize is None: return read_csv_without_chunksize(path, **kwargs) + # Let dask.dataframe generate meta dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") - usecols = kwargs.pop("usecols", None) - meta = dask_reader(filenames[0], **kwargs)._meta + kwargs1 = kwargs.copy() + usecols = kwargs1.pop("usecols", None) + dtype = kwargs1.pop("dtype", None) + meta = dask_reader(filenames[0], **kwargs1)._meta + names = meta.columns + if usecols or dtype: + # Regenerate meta with original kwargs if + # `usecols` or `dtype` was specified + meta = dask_reader(filenames[0], **kwargs)._meta dsk = {} i = 0 @@ -127,18 +135,13 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs): chunksize, ) # specify which chunk of the file we care about if start != 0: - kwargs2[ - "names" - ] = meta.columns # no header in the middle of the file + kwargs2["names"] = names # no header in the middle of the file kwargs2["header"] = None - kwargs2["usecols"] = usecols dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) i += 1 divisions = [None] * (len(dsk) + 1) - if usecols is not None: - meta = meta[usecols] return dd.core.new_dd_object(dsk, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index 98061f6c624..32960a90bd7 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -136,7 +136,8 @@ def test_read_csv_chunksize_none(tmp_path, compression, size): dd.assert_eq(df, df2) -def test_csv_reader_usecols(tmp_path): +@pytest.mark.parametrize("dtype", [{"b": str, "c": int}, None]) +def test_csv_reader_usecols(tmp_path, dtype): df = cudf.DataFrame( { "a": [1, 2, 3, 4] * 100, @@ -147,6 +148,6 @@ def test_csv_reader_usecols(tmp_path): csv_path = str(tmp_path / "usecols_data.csv") df.to_csv(csv_path, index=False) ddf = dask_cudf.from_cudf(df[["b", "c"]], npartitions=5) - ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"]) + ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"], dtype=dtype) dd.assert_eq(ddf, ddf2, check_divisions=False, check_index=False) From 1db05c9d889d04df113986eeee0356778ce8b003 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 11:45:54 -0600 Subject: [PATCH 027/202] Use Java classloader to find test resources (#9760) Updates the Java tests to use the classloader to locate test files rather than reaching directly into the source directory. 
Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Gera Shegalov (https://github.com/gerashegalov) URL: https://github.com/rapidsai/cudf/pull/9760 --- .../src/test/java/ai/rapids/cudf/TableTest.java | 14 +++++++------- .../src/test/java/ai/rapids/cudf/TestUtils.java | 17 ++++++++++++++++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 4512a08430c..b4247e9bb7c 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -70,11 +70,11 @@ import static org.junit.jupiter.api.Assertions.assertTrue; public class TableTest extends CudfTestBase { - private static final File TEST_PARQUET_FILE = new File("src/test/resources/acq.parquet"); - private static final File TEST_ORC_FILE = new File("src/test/resources/TestOrcFile.orc"); - private static final File TEST_ORC_TIMESTAMP_DATE_FILE = new File( - "src/test/resources/timestamp-date-test.orc"); - private static final File TEST_DECIMAL_PARQUET_FILE = new File("src/test/resources/decimal.parquet"); + private static final File TEST_PARQUET_FILE = TestUtils.getResourceAsFile("acq.parquet"); + private static final File TEST_ORC_FILE = TestUtils.getResourceAsFile("TestOrcFile.orc"); + private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc"); + private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet"); + private static final File TEST_SIMPLE_CSV_FILE = TestUtils.getResourceAsFile("simple.csv"); private static final Schema CSV_DATA_BUFFER_SCHEMA = Schema.builder() .column(DType.INT32, "A") @@ -548,7 +548,7 @@ void testReadCSVPrune() { .column(0, 1, 2, 3, 4, 5, 6, 7, 8, 9) .column(110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.2, 119.8) .build(); - Table table = Table.readCSV(schema, opts, new File("./src/test/resources/simple.csv"))) { + Table table = Table.readCSV(schema, opts, TEST_SIMPLE_CSV_FILE)) { assertTablesAreEqual(expected, table); } } @@ -675,7 +675,7 @@ void testReadCSV() { .column(120L, 121L, 122L, 123L, 124L, 125L, 126L, 127L, 128L, 129L) .column("one", "two", "three", "four", "five", "six", "seven\ud801\uddb8", "eight\uBF68", "nine\u03E8", "ten") .build(); - Table table = Table.readCSV(schema, new File("./src/test/resources/simple.csv"))) { + Table table = Table.readCSV(schema, TEST_SIMPLE_CSV_FILE)) { assertTablesAreEqual(expected, table); } } diff --git a/java/src/test/java/ai/rapids/cudf/TestUtils.java b/java/src/test/java/ai/rapids/cudf/TestUtils.java index 5a799c666c2..a1acab5883b 100644 --- a/java/src/test/java/ai/rapids/cudf/TestUtils.java +++ b/java/src/test/java/ai/rapids/cudf/TestUtils.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,9 @@ package ai.rapids.cudf; +import java.io.File; +import java.net.URISyntaxException; +import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.Random; @@ -211,4 +214,16 @@ static Double[] getDoubles(final long seed, final int size, int specialValues) { }); return result; } + + public static File getResourceAsFile(String resourceName) { + URL url = TestUtils.class.getClassLoader().getResource(resourceName); + if (url == null) { + throw new IllegalArgumentException("Unable to locate resource: " + resourceName); + } + try { + return new File(url.toURI()); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + } } From 1697f63b9e6e80695cb157f479fada72d053fa1a Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 30 Nov 2021 23:39:13 +0530 Subject: [PATCH 028/202] Run compute-sanitizer in nightly build (#9641) Addresses part of https://github.com/rapidsai/cudf/issues/904 - This PR enables running `compute-sanitizer --tool memcheck` on the libcudf unit tests when the env var `COMPUTE_SANITIZER_ENABLE=true` is set. This env var will be enabled only in nightly builds of cudf. (To be enabled in PR https://github.com/rapidsai/gpuci-scripts/pull/675) - This PR also adds a script that parses the compute-sanitizer log into a JUnit XML file that Jenkins can process. Only failures are reported; if there are no errors, no tests appear under the memcheck results. Note: Only `memcheck` is enabled for now. When required, other compute-sanitizer checks could be enabled later. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - MithunR (https://github.com/mythrocks) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/9641 --- ci/gpu/build.sh | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 664e774c68a..8f83c169330 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION.
############################################## # cuDF GPU build and test script for CI # ############################################## @@ -176,6 +176,28 @@ else ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" done + ################################################################################ + # MEMCHECK - Run compute-sanitizer on GoogleTest (only in nightly builds) + ################################################################################ + if [[ "$BUILD_MODE" == "branch" && "$BUILD_TYPE" == "gpu" ]]; then + if [[ "$COMPUTE_SANITIZER_ENABLE" == "true" ]]; then + gpuci_logger "Memcheck on GoogleTests with rmm_mode=cuda" + export GTEST_CUDF_RMM_MODE=cuda + COMPUTE_SANITIZER_CMD="compute-sanitizer --tool memcheck" + mkdir -p "$WORKSPACE/test-results/" + for gt in gtests/*; do + test_name=$(basename ${gt}) + if [[ "$test_name" == "ERROR_TEST" ]]; then + continue + fi + echo "Running GoogleTest $test_name" + ${COMPUTE_SANITIZER_CMD} ${gt} | tee "$WORKSPACE/test-results/${test_name}.cs.log" + done + unset GTEST_CUDF_RMM_MODE + # test-results/*.cs.log are processed in gpuci + fi + fi + CUDF_CONDA_FILE=`find ${CONDA_ARTIFACT_PATH} -name "libcudf-*.tar.bz2"` CUDF_CONDA_FILE=`basename "$CUDF_CONDA_FILE" .tar.bz2` #get filename without extension CUDF_CONDA_FILE=${CUDF_CONDA_FILE//-/=} #convert to conda install From 69d576543b5414372f36d02a189a7217d3bb8006 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 30 Nov 2021 14:40:34 -0500 Subject: [PATCH 029/202] Update check for inf/nan strings in libcudf float conversion to ignore case (#9694) Reference https://github.com/rapidsai/cudf/pull/9613/files#r743579126 Add case-insensitive support for the strings `INF`, `INFINITY`, and `NAN` to `cudf::strings::is_float` and `cudf::strings::to_float`, for consistency with https://en.cppreference.com/w/cpp/string/basic_string/stof Also, remove the expensive `replace` call done in cuDF Python before calling this conversion. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Robert Maynard (https://github.com/robertmaynard) - Nghia Truong (https://github.com/ttnghia) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9694 --- cpp/include/cudf/strings/string.cuh | 64 +++++++++++++--- cpp/src/strings/convert/convert_floats.cu | 13 ++-- cpp/tests/strings/floats_tests.cpp | 51 ++++--------- .../java/ai/rapids/cudf/ColumnVectorTest.java | 23 +++--- python/cudf/cudf/core/column/string.py | 73 ------------------- 5 files changed, 85 insertions(+), 139 deletions(-) diff --git a/cpp/include/cudf/strings/string.cuh b/cpp/include/cudf/strings/string.cuh index 82da5ad8f10..d85d19d7f10 100644 --- a/cpp/include/cudf/strings/string.cuh +++ b/cpp/include/cudf/strings/string.cuh @@ -52,6 +52,43 @@ inline __device__ bool is_integer(string_view const& d_str) thrust::seq, begin, end, [] __device__(auto chr) { return chr >= '0' && chr <= '9'; }); } +/** + * @brief Returns true if input contains the not-a-number string. + * + * The following are valid for this function: "NAN" and "NaN" + * @param d_str input string + * @return true if input is as valid NaN string. + */ +inline __device__ bool is_nan_str(string_view const& d_str) +{ + auto const ptr = d_str.data(); + return (d_str.size_bytes() == 3) && (ptr[0] == 'N' || ptr[0] == 'n') && + (ptr[1] == 'A' || ptr[1] == 'a') && (ptr[2] == 'N' || ptr[2] == 'n'); +} + +/** + * @brief Returns true if input contains the infinity string.
+ * + * The following are valid for this function: "INF", "INFINITY", and "Inf" + * @param d_str input string + * @return true if input is as valid Inf string. + */ +inline __device__ bool is_inf_str(string_view const& d_str) +{ + auto const ptr = d_str.data(); + auto const size = d_str.size_bytes(); + + if (size != 3 && size != 8) return false; + + auto const prefix_valid = (ptr[0] == 'I' || ptr[0] == 'i') && (ptr[1] == 'N' || ptr[1] == 'n') && + (ptr[2] == 'F' || ptr[2] == 'f'); + + return prefix_valid && + ((size == 3) || ((ptr[3] == 'I' || ptr[3] == 'i') && (ptr[4] == 'N' || ptr[4] == 'n') && + (ptr[5] == 'I' || ptr[5] == 'i') && (ptr[6] == 'T' || ptr[6] == 't') && + (ptr[7] == 'Y' || ptr[7] == 'y'))); +} + /** * @brief Returns `true` if all characters in the string * are valid for conversion to a float type. @@ -65,8 +102,8 @@ inline __device__ bool is_integer(string_view const& d_str) * An empty string returns `false`. * No bounds checking is performed to verify if the value would fit * within a specific float type. - * The following strings are also allowed "NaN", "Inf" and, "-Inf" - * and will return true. + * The following strings are also allowed and will return true: + * "NaN", "NAN", "Inf", "INF", "INFINITY" * * @param d_str String to check. * @return true if string has valid float characters @@ -74,29 +111,32 @@ inline __device__ bool is_integer(string_view const& d_str) inline __device__ bool is_float(string_view const& d_str) { if (d_str.empty()) return false; - // strings allowed by the converter - if (d_str.compare("NaN", 3) == 0) return true; - if (d_str.compare("Inf", 3) == 0) return true; - if (d_str.compare("-Inf", 4) == 0) return true; bool decimal_found = false; bool exponent_found = false; size_type bytes = d_str.size_bytes(); const char* data = d_str.data(); // sign character allowed at the beginning of the string - size_type chidx = (*data == '-' || *data == '+') ? 1 : 0; - bool result = chidx < bytes; + size_type ch_idx = (*data == '-' || *data == '+') ? 1 : 0; + + bool result = ch_idx < bytes; + // check for nan and infinity strings + if (result && data[ch_idx] > '9') { + auto const inf_nan = string_view(data + ch_idx, bytes - ch_idx); + if (is_nan_str(inf_nan) || is_inf_str(inf_nan)) return true; + } + // check for float chars [0-9] and a single decimal '.' // and scientific notation [eE][+-][0-9] - for (; chidx < bytes; ++chidx) { - auto chr = data[chidx]; + for (; ch_idx < bytes; ++ch_idx) { + auto chr = data[ch_idx]; if (chr >= '0' && chr <= '9') continue; if (!decimal_found && chr == '.') { decimal_found = true; // no more decimals continue; } if (!exponent_found && (chr == 'e' || chr == 'E')) { - if (chidx + 1 < bytes) chr = data[chidx + 1]; - if (chr == '-' || chr == '+') ++chidx; + if (ch_idx + 1 < bytes) chr = data[ch_idx + 1]; + if (chr == '-' || chr == '+') ++ch_idx; decimal_found = true; // no decimal allowed in exponent exponent_found = true; // no more exponents continue; diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 366d4fe7d42..70b5f528213 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -45,7 +45,7 @@ namespace { * @brief This function converts the given string into a * floating point double value. * - * This will also map strings containing "NaN", "Inf" and "-Inf" + * This will also map strings containing "NaN", "Inf", etc. * to the appropriate float values. * * This function will also handle scientific notation format. 
@@ -55,16 +55,19 @@ __device__ inline double stod(string_view const& d_str) const char* in_ptr = d_str.data(); const char* end = in_ptr + d_str.size_bytes(); if (end == in_ptr) return 0.0; - // special strings - if (d_str.compare("NaN", 3) == 0) return std::numeric_limits::quiet_NaN(); - if (d_str.compare("Inf", 3) == 0) return std::numeric_limits::infinity(); - if (d_str.compare("-Inf", 4) == 0) return -std::numeric_limits::infinity(); double sign{1.0}; if (*in_ptr == '-' || *in_ptr == '+') { sign = (*in_ptr == '-' ? -1 : 1); ++in_ptr; } + // special strings: NaN, Inf + if ((in_ptr < end) && *in_ptr > '9') { + auto const inf_nan = string_view(in_ptr, static_cast(thrust::distance(in_ptr, end))); + if (string::is_nan_str(inf_nan)) return std::numeric_limits::quiet_NaN(); + if (string::is_inf_str(inf_nan)) return sign * std::numeric_limits::infinity(); + } + // Parse and store the mantissa as much as we can, // until we are about to exceed the limit of uint64_t constexpr uint64_t max_holding = (std::numeric_limits::max() - 9L) / 10L; diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp index 126bffa1e49..e6f4f6bb8d9 100644 --- a/cpp/tests/strings/floats_tests.cpp +++ b/cpp/tests/strings/floats_tests.cpp @@ -58,32 +58,20 @@ TEST_F(StringsConvertTest, IsFloat) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); cudf::test::strings_column_wrapper strings2( - {"+175", "-34", "9.8", "1234567890", "6.7e17", "-917.2e5"}); + {"-34", "9.8", "1234567890", "-917.2e5", "INF", "NAN", "-Inf", "INFINITY"}); results = cudf::strings::is_float(cudf::strings_column_view(strings2)); - cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected2({1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } TEST_F(StringsConvertTest, ToFloats32) { - std::vector h_strings{"1234", - nullptr, - "-876", - "543.2", - "-0.12", - ".25", - "-.002", - "", - "-0.0", - "1.2e4", - "NaN", - "abc123", - "123abc", - "456e", - "-1.78e+5", - "-122.33644782123456789", - "12e+309", - "3.4028236E38"}; + std::vector h_strings{ + "1234", nullptr, "-876", "543.2", + "-0.12", ".25", "-.002", "", + "-0.0", "1.2e4", "NAN", "abc123", + "123abc", "456e", "-1.78e+5", "-122.33644782123456789", + "12e+309", "3.4028236E38", "INF", "Infinity"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), @@ -135,24 +123,11 @@ TEST_F(StringsConvertTest, FromFloats32) TEST_F(StringsConvertTest, ToFloats64) { - std::vector h_strings{"1234", - nullptr, - "-876", - "543.2", - "-0.12", - ".25", - "-.002", - "", - "-0.0", - "1.28e256", - "NaN", - "abc123", - "123abc", - "456e", - "-1.78e+5", - "-122.33644782", - "12e+309", - "1.7976931348623159E308"}; + std::vector h_strings{ + "1234", nullptr, "-876", "543.2", "-0.12", ".25", + "-.002", "", "-0.0", "1.28e256", "NaN", "abc123", + "123abc", "456e", "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308", + "-Inf", "-INFINITY"}; cudf::test::strings_column_wrapper strings( h_strings.begin(), h_strings.end(), diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index a582541a0d4..cf602c26717 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4919,11 +4919,12 @@ void testIsFloat() { try (ColumnVector floatStringCV = ColumnVector.fromStrings(floatStrings); ColumnVector isFloat = floatStringCV.isFloat(); 
ColumnVector floats = floatStringCV.asFloats(); - ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, 0f, Float.POSITIVE_INFINITY, - Float.NEGATIVE_INFINITY, 0f, 0f, -0f, 0f, Float.MAX_VALUE, Float.POSITIVE_INFINITY, - -Float.MAX_VALUE, Float.NEGATIVE_INFINITY, 1.2e-24f, 0f, 0f, null, 423f); - ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false, - false, true, true, true, true, true, true, true, false, false, null, true)) { + ColumnVector expectedFloats = ColumnVector.fromBoxedFloats(0f, Float.NaN, Float.POSITIVE_INFINITY, + Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY, Float.POSITIVE_INFINITY, -0f, 0f, + Float.MAX_VALUE, Float.POSITIVE_INFINITY, -Float.MAX_VALUE, Float.NEGATIVE_INFINITY, + 1.2e-24f, 0f, 0f, null, 423f); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true, + true, true, true, true, true, true, true, true, false, false, null, true)) { assertColumnsAreEqual(expected, isFloat); assertColumnsAreEqual(expectedFloats, floats); } @@ -4944,12 +4945,12 @@ void testIsDouble() { try (ColumnVector doubleStringCV = ColumnVector.fromStrings(doubleStrings); ColumnVector isDouble = doubleStringCV.isFloat(); ColumnVector doubles = doubleStringCV.asDoubles(); - ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, 0d, - Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, 0d, 0d, -0d, 0d, Double.MAX_VALUE, - Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY, 1.2e-234d, 0d, - 0d, null, 423d); - ColumnVector expected = ColumnVector.fromBoxedBooleans(false, false, true, true, false, - false, true, true, true, true, true, true, true, false, false, null, true)) { + ColumnVector expectedDoubles = ColumnVector.fromBoxedDoubles(0d, Double.NaN, + Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, + -0d, 0d, Double.MAX_VALUE, Double.POSITIVE_INFINITY, -Double.MAX_VALUE, Double.NEGATIVE_INFINITY, + 1.2e-234d, 0d, 0d, null, 423d); + ColumnVector expected = ColumnVector.fromBoxedBooleans(false, true, true, true, true, + true, true, true, true, true, true, true, true, false, false, null, true)) { assertColumnsAreEqual(expected, isDouble); assertColumnsAreEqual(expectedDoubles, doubles); } diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index a167383c65c..2a91abc5701 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -97,69 +97,6 @@ def str_to_boolean(column: StringColumn): cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta, } -_NAN_INF_VARIATIONS = [ - "nan", - "NAN", - "Nan", - "naN", - "nAN", - "NAn", - "nAn", - "-inf", - "-INF", - "-InF", - "-inF", - "-iNF", - "-INf", - "-iNf", - "+inf", - "+INF", - "+InF", - "+inF", - "+iNF", - "+INf", - "+Inf", - "+iNf", - "inf", - "INF", - "InF", - "inF", - "iNF", - "INf", - "iNf", -] -_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS = [ - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "NaN", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "-Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", - "Inf", -] - def _is_supported_regex_flags(flags): return flags == 0 or ( @@ -5309,16 +5246,6 @@ def as_numerical_column( "type due to presence of non-integer values." 
) elif out_dtype.kind == "f": - # TODO: Replace this `replace` call with a - # case-insensitive method once following - # issue is fixed: https://github.com/rapidsai/cudf/issues/5217 - old_values = cudf.core.column.as_column(_NAN_INF_VARIATIONS) - new_values = cudf.core.column.as_column( - _LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS - ) - string_col = libcudf.replace.replace( - string_col, old_values, new_values - ) if not libstrings.is_float(string_col).all(): raise ValueError( "Could not convert strings to float " From 00a8845780ae9289f483f1113e5c62d4acd7dfe7 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 14:02:24 -0600 Subject: [PATCH 030/202] Refactor TableTest assertion methods to a separate utility class (#9762) TableTest has a number of dependencies, e.g.: Parquet, Hadoop, etc., that make it less ideal to be used in an external project. This moves the column and table assertion methods to a separate AssertUtils utility class that avoids the extra dependencies. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Gera Shegalov (https://github.com/gerashegalov) URL: https://github.com/rapidsai/cudf/pull/9762 --- .../ai/rapids/cudf/ArrowColumnVectorTest.java | 3 +- .../test/java/ai/rapids/cudf/AssertUtils.java | 272 ++++++++++++++++++ .../java/ai/rapids/cudf/BinaryOpTest.java | 2 +- .../ai/rapids/cudf/ByteColumnVectorTest.java | 6 +- .../java/ai/rapids/cudf/ColumnVectorTest.java | 38 +-- .../test/java/ai/rapids/cudf/IfElseTest.java | 2 +- .../ai/rapids/cudf/IntColumnVectorTest.java | 4 +- .../test/java/ai/rapids/cudf/ScalarTest.java | 2 +- .../test/java/ai/rapids/cudf/TableTest.java | 251 +--------------- .../cudf/TimestampColumnVectorTest.java | 2 +- .../test/java/ai/rapids/cudf/UnaryOpTest.java | 2 +- .../cudf/ast/CompiledExpressionTest.java | 2 +- 12 files changed, 309 insertions(+), 277 deletions(-) create mode 100644 java/src/test/java/ai/rapids/cudf/AssertUtils.java diff --git a/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java index d5d4059d18d..2a11b24b3a8 100644 --- a/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ArrowColumnVectorTest.java @@ -21,7 +21,6 @@ import java.nio.ByteBuffer; import java.util.ArrayList; -import ai.rapids.cudf.HostColumnVector.BasicType; import ai.rapids.cudf.HostColumnVector.ListType; import ai.rapids.cudf.HostColumnVector.StructType; @@ -40,7 +39,7 @@ import org.junit.jupiter.api.Test; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; diff --git a/java/src/test/java/ai/rapids/cudf/AssertUtils.java b/java/src/test/java/ai/rapids/cudf/AssertUtils.java new file mode 100644 index 00000000000..184e7dd0c57 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/AssertUtils.java @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** Utility methods for asserting in unit tests */ +public class AssertUtils { + + /** + * Checks and asserts that passed in columns match + * @param expect The expected result column + * @param cv The input column + */ + public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) { + assertColumnsAreEqual(expect, cv, "unnamed"); + } + + /** + * Checks and asserts that passed in columns match + * @param expected The expected result column + * @param cv The input column + * @param colName The name of the column + */ + public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) { + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); + } + + /** + * Checks and asserts that passed in host columns match + * @param expected The expected result host column + * @param cv The input host column + * @param colName The name of the host column + */ + public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) { + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); + } + + /** + * Checks and asserts that passed in Struct columns match + * @param expected The expected result Struct column + * @param cv The input Struct column + */ + public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) { + assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false); + } + + /** + * Checks and asserts that passed in Struct columns match + * @param expected The expected result Struct column + * @param rowOffset The row number to look from + * @param length The number of rows to consider + * @param cv The input Struct column + * @param colName The name of the column + * @param enableNullCountCheck Whether to check for nulls in the Struct column + * @param enableNullabilityCheck Whether the table have a validity mask + */ + public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length, + ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { + try (HostColumnVector hostExpected = expected.copyToHost(); + HostColumnVector hostcv = cv.copyToHost()) { + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck); + } + } + + /** + * Checks and asserts that passed in columns match + * @param expected The expected result column + * @param cv The input column + * @param colName The name of the column + * @param enableNullCheck Whether to check for nulls in the column + * @param enableNullabilityCheck Whether the table have a validity mask + */ + public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length, + ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) { + try (HostColumnVector hostExpected = expected.copyToHost(); + HostColumnVector hostcv = cv.copyToHost()) { + assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck); + } + } + + /** + * Checks and asserts that passed in host columns match + * @param expected The 
expected result host column + * @param rowOffset start row index + * @param length number of rows from starting offset + * @param cv The input host column + * @param colName The name of the host column + * @param enableNullCountCheck Whether to check for nulls in the host column + */ + public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length, + HostColumnVectorCore cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { + assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName); + assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName); + assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName); + if (enableNullCountCheck) { + assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For Column " + colName); + } else { + // TODO add in a proper check when null counts are supported by serializing a partitioned column + } + if (enableNullabilityCheck) { + assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected"); + } + DType type = expected.getType(); + for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) { + long tableRow = expectedRow - rowOffset; + assertEquals(expected.isNull(expectedRow), cv.isNull(tableRow), + "NULL for Column " + colName + " Row " + tableRow); + if (!expected.isNull(expectedRow)) { + switch (type.typeId) { + case BOOL8: // fall through + case INT8: // fall through + case UINT8: + assertEquals(expected.getByte(expectedRow), cv.getByte(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case INT16: // fall through + case UINT16: + assertEquals(expected.getShort(expectedRow), cv.getShort(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case INT32: // fall through + case UINT32: // fall through + case TIMESTAMP_DAYS: + case DURATION_DAYS: + case DECIMAL32: + assertEquals(expected.getInt(expectedRow), cv.getInt(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case INT64: // fall through + case UINT64: // fall through + case DURATION_MICROSECONDS: // fall through + case DURATION_MILLISECONDS: // fall through + case DURATION_NANOSECONDS: // fall through + case DURATION_SECONDS: // fall through + case TIMESTAMP_MICROSECONDS: // fall through + case TIMESTAMP_MILLISECONDS: // fall through + case TIMESTAMP_NANOSECONDS: // fall through + case TIMESTAMP_SECONDS: + case DECIMAL64: + assertEquals(expected.getLong(expectedRow), cv.getLong(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case DECIMAL128: + assertEquals(expected.getBigDecimal(expectedRow), cv.getBigDecimal(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case FLOAT32: + CudfTestBase.assertEqualsWithinPercentage(expected.getFloat(expectedRow), cv.getFloat(tableRow), 0.0001, + "Column " + colName + " Row " + tableRow); + break; + case FLOAT64: + CudfTestBase.assertEqualsWithinPercentage(expected.getDouble(expectedRow), cv.getDouble(tableRow), 0.0001, + "Column " + colName + " Row " + tableRow); + break; + case STRING: + assertArrayEquals(expected.getUTF8(expectedRow), cv.getUTF8(tableRow), + "Column " + colName + " Row " + tableRow); + break; + case LIST: + HostMemoryBuffer expectedOffsets = expected.getOffsets(); + HostMemoryBuffer cvOffsets = cv.getOffsets(); + int expectedChildRows = expectedOffsets.getInt((expectedRow + 1) * 4) - + expectedOffsets.getInt(expectedRow * 4); + int 
cvChildRows = cvOffsets.getInt((tableRow + 1) * 4) - + cvOffsets.getInt(tableRow * 4); + assertEquals(expectedChildRows, cvChildRows, "Child row count for Column " + + colName + " Row " + tableRow); + break; + case STRUCT: + // parent column only has validity which was checked above + break; + default: + throw new IllegalArgumentException(type + " is not supported yet"); + } + } + } + + if (type.isNestedType()) { + switch (type.typeId) { + case LIST: + int expectedChildRowOffset = 0; + int numChildRows = 0; + if (length > 0) { + HostMemoryBuffer expectedOffsets = expected.getOffsets(); + HostMemoryBuffer cvOffsets = cv.getOffsets(); + expectedChildRowOffset = expectedOffsets.getInt(rowOffset * 4); + numChildRows = expectedOffsets.getInt((rowOffset + length) * 4) - + expectedChildRowOffset; + } + assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset, + numChildRows, cv.getNestedChildren().get(0), colName + " list child", + enableNullCountCheck, enableNullabilityCheck); + break; + case STRUCT: + List expectedChildren = expected.getNestedChildren(); + List cvChildren = cv.getNestedChildren(); + for (int i = 0; i < expectedChildren.size(); i++) { + HostColumnVectorCore expectedChild = expectedChildren.get(i); + HostColumnVectorCore cvChild = cvChildren.get(i); + String childName = colName + " child " + i; + assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + colName); + assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild, + colName, enableNullCountCheck, enableNullabilityCheck); + } + break; + default: + throw new IllegalArgumentException(type + " is not supported yet"); + } + } + } + + /** + * Checks and asserts that the two tables from a given rowindex match based on a provided schema. 
+ * @param expected the expected result table + * @param rowOffset the row number to start checking from + * @param length the number of rows to check + * @param table the input table to compare against expected + * @param enableNullCheck whether to check for nulls or not + * @param enableNullabilityCheck whether the table have a validity mask + */ + public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table, + boolean enableNullCheck, boolean enableNullabilityCheck) { + assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns()); + assertEquals(length, table.getRowCount(), "ROW COUNT"); + for (int col = 0; col < expected.getNumberOfColumns(); col++) { + ColumnVector expect = expected.getColumn(col); + ColumnVector cv = table.getColumn(col); + String name = String.valueOf(col); + if (rowOffset != 0 || length != expected.getRowCount()) { + name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1); + } + assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck); + } + } + + /** + * Checks and asserts that the two tables match + * @param expected the expected result table + * @param table the input table to compare against expected + */ + public static void assertTablesAreEqual(Table expected, Table table) { + assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false); + } + + public static void assertTableTypes(DType[] expectedTypes, Table t) { + int len = t.getNumberOfColumns(); + assertEquals(expectedTypes.length, len); + for (int i = 0; i < len; i++) { + ColumnVector vec = t.getColumn(i); + DType type = vec.getType(); + assertEquals(expectedTypes[i], type, "Types don't match at " + i); + } + } +} diff --git a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java index 894861b8c44..0ca997d3c80 100644 --- a/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java +++ b/java/src/test/java/ai/rapids/cudf/BinaryOpTest.java @@ -27,7 +27,7 @@ import java.util.Arrays; import java.util.stream.IntStream; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static ai.rapids.cudf.TestUtils.*; import static org.junit.jupiter.api.Assertions.assertThrows; diff --git a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java index 878fa7e4516..a26dbec4907 100644 --- a/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ByteColumnVectorTest.java @@ -127,9 +127,9 @@ public void testCastToByte() { ColumnVector expected1 = ColumnVector.fromBytes((byte)4, (byte)3, (byte)8); ColumnVector expected2 = ColumnVector.fromBytes((byte)100); ColumnVector expected3 = ColumnVector.fromBytes((byte)-23)) { - TableTest.assertColumnsAreEqual(expected1, byteColumnVector1); - TableTest.assertColumnsAreEqual(expected2, byteColumnVector2); - TableTest.assertColumnsAreEqual(expected3, byteColumnVector3); + AssertUtils.assertColumnsAreEqual(expected1, byteColumnVector1); + AssertUtils.assertColumnsAreEqual(expected2, byteColumnVector2); + AssertUtils.assertColumnsAreEqual(expected3, byteColumnVector3); } } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index cf602c26717..fa9052029cc 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ 
b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -34,8 +34,10 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertStructColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertTablesAreEqual; import static ai.rapids.cudf.QuantileMethod.*; -import static ai.rapids.cudf.TableTest.*; import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assumptions.assumeTrue; @@ -86,8 +88,8 @@ void testTransformVector() { ColumnVector cv1 = cv.transform(ptx, true); ColumnVector cv2 = cv.transform(cuda, false); ColumnVector expected = ColumnVector.fromBoxedInts(2*2-2, 3*3-3, null, 4*4-4)) { - TableTest.assertColumnsAreEqual(expected, cv1); - TableTest.assertColumnsAreEqual(expected, cv2); + assertColumnsAreEqual(expected, cv1); + assertColumnsAreEqual(expected, cv2); } } @@ -252,7 +254,7 @@ void testStringCreation() { try (ColumnVector cv = ColumnVector.fromStrings("d", "sd", "sde", null, "END"); HostColumnVector host = cv.copyToHost(); ColumnVector backAgain = host.copyToDevice()) { - TableTest.assertColumnsAreEqual(cv, backAgain); + assertColumnsAreEqual(cv, backAgain); } } @@ -265,7 +267,7 @@ void testUTF8StringCreation() { null, "END".getBytes(StandardCharsets.UTF_8)); ColumnVector expected = ColumnVector.fromStrings("d", "sd", "sde", null, "END")) { - TableTest.assertColumnsAreEqual(expected, cv); + assertColumnsAreEqual(expected, cv); } } @@ -299,7 +301,7 @@ void testConcatNoNulls() { ColumnVector v2 = ColumnVector.fromInts(8, 9); ColumnVector v = ColumnVector.concatenate(v0, v1, v2); ColumnVector expected = ColumnVector.fromInts(1, 2, 3, 4, 5, 6, 7, 8, 9)) { - TableTest.assertColumnsAreEqual(expected, v); + assertColumnsAreEqual(expected, v); } } @@ -310,7 +312,7 @@ void testConcatWithNulls() { ColumnVector v2 = ColumnVector.fromBoxedDoubles(null, 9.0); ColumnVector v = ColumnVector.concatenate(v0, v1, v2); ColumnVector expected = ColumnVector.fromBoxedDoubles(1., 2., 3., 4., 5., 6., 7., null, 9.)) { - TableTest.assertColumnsAreEqual(expected, v); + assertColumnsAreEqual(expected, v); } } @@ -1882,13 +1884,13 @@ void testSubvector() { try (ColumnVector vec = ColumnVector.fromBoxedInts(1, 2, 3, null, 5); ColumnVector expected = ColumnVector.fromBoxedInts(2, 3, null, 5); ColumnVector found = vec.subVector(1, 5)) { - TableTest.assertColumnsAreEqual(expected, found); + assertColumnsAreEqual(expected, found); } try (ColumnVector vec = ColumnVector.fromStrings("1", "2", "3", null, "5"); ColumnVector expected = ColumnVector.fromStrings("2", "3", null, "5"); ColumnVector found = vec.subVector(1, 5)) { - TableTest.assertColumnsAreEqual(expected, found); + assertColumnsAreEqual(expected, found); } } @@ -2014,7 +2016,7 @@ void testTrimStringsWhiteSpace() { try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n"); ColumnVector trimmed = cv.strip(); ColumnVector expected = ColumnVector.fromStrings("123", "123", null, "123", "123")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2024,7 +2026,7 @@ void testTrimStrings() { Scalar one = Scalar.fromString(" 1"); ColumnVector trimmed = cv.strip(one); ColumnVector expected = ColumnVector.fromStrings("23", "23", null, "23", "\t\t123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2033,7 +2035,7 @@ void 
testLeftTrimStringsWhiteSpace() { try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n"); ColumnVector trimmed = cv.lstrip(); ColumnVector expected = ColumnVector.fromStrings("123", "123 ", null, "123 ", "123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2043,7 +2045,7 @@ void testLeftTrimStrings() { Scalar one = Scalar.fromString(" 1"); ColumnVector trimmed = cv.lstrip(one); ColumnVector expected = ColumnVector.fromStrings("23", "23 ", null, "231", "\t\t123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2052,7 +2054,7 @@ void testRightTrimStringsWhiteSpace() { try (ColumnVector cv = ColumnVector.fromStrings(" 123", "123 ", null, " 123 ", "\t\t123\n\n"); ColumnVector trimmed = cv.rstrip(); ColumnVector expected = ColumnVector.fromStrings(" 123", "123", null, " 123", "\t\t123")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2062,7 +2064,7 @@ void testRightTrimStrings() { Scalar one = Scalar.fromString(" 1"); ColumnVector trimmed = cv.rstrip(one); ColumnVector expected = ColumnVector.fromStrings("123", "123", null, "123", "\t\t123\n\n")) { - TableTest.assertColumnsAreEqual(expected, trimmed); + assertColumnsAreEqual(expected, trimmed); } } @@ -2108,7 +2110,7 @@ void testCountElements() { Arrays.asList(1, 2, 3), Arrays.asList(1, 2, 3, 4)); ColumnVector lengths = cv.countElements(); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 2, 3, 4)) { - TableTest.assertColumnsAreEqual(expected, lengths); + assertColumnsAreEqual(expected, lengths); } } @@ -2117,7 +2119,7 @@ void testStringLengths() { try (ColumnVector cv = ColumnVector.fromStrings("1", "12", null, "123", "1234"); ColumnVector lengths = cv.getCharLengths(); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, null, 3, 4)) { - TableTest.assertColumnsAreEqual(expected, lengths); + assertColumnsAreEqual(expected, lengths); } } @@ -2126,7 +2128,7 @@ void testGetByteCount() { try (ColumnVector cv = ColumnVector.fromStrings("1", "12", "123", null, "1234"); ColumnVector byteLengthVector = cv.getByteCount(); ColumnVector expected = ColumnVector.fromBoxedInts(1, 2, 3, null, 4)) { - TableTest.assertColumnsAreEqual(expected, byteLengthVector); + assertColumnsAreEqual(expected, byteLengthVector); } } diff --git a/java/src/test/java/ai/rapids/cudf/IfElseTest.java b/java/src/test/java/ai/rapids/cudf/IfElseTest.java index 86ddcc23416..a078befdf40 100644 --- a/java/src/test/java/ai/rapids/cudf/IfElseTest.java +++ b/java/src/test/java/ai/rapids/cudf/IfElseTest.java @@ -25,7 +25,7 @@ import java.util.stream.Stream; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.assertThrows; public class IfElseTest extends CudfTestBase { diff --git a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java index dd03c4de69e..2fb8164534b 100644 --- a/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/IntColumnVectorTest.java @@ -117,8 +117,8 @@ public void testCastToInt() { ColumnVector expected1 = ColumnVector.fromInts(4, 3, 8); ColumnVector intColumnVector2 = shortColumnVector.asInts(); ColumnVector expected2 = ColumnVector.fromInts(100)) { - TableTest.assertColumnsAreEqual(expected1, 
intColumnVector1); - TableTest.assertColumnsAreEqual(expected2, intColumnVector2); + AssertUtils.assertColumnsAreEqual(expected1, intColumnVector1); + AssertUtils.assertColumnsAreEqual(expected2, intColumnVector2); } } diff --git a/java/src/test/java/ai/rapids/cudf/ScalarTest.java b/java/src/test/java/ai/rapids/cudf/ScalarTest.java index 0889363c2d0..86c340bb321 100644 --- a/java/src/test/java/ai/rapids/cudf/ScalarTest.java +++ b/java/src/test/java/ai/rapids/cudf/ScalarTest.java @@ -29,7 +29,7 @@ import java.nio.charset.StandardCharsets; import java.util.Arrays; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.*; public class ScalarTest extends CudfTestBase { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index b4247e9bb7c..fa221e19387 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -57,6 +57,11 @@ import java.util.stream.Collectors; import static ai.rapids.cudf.ColumnWriterOptions.mapColumn; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertPartialColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertPartialTablesAreEqual; +import static ai.rapids.cudf.AssertUtils.assertTableTypes; +import static ai.rapids.cudf.AssertUtils.assertTablesAreEqual; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; import static ai.rapids.cudf.ParquetWriterOptions.structBuilder; import static ai.rapids.cudf.Table.TestBuilder; @@ -94,242 +99,6 @@ public class TableTest extends CudfTestBase { "8|118.2|128\n" + "9|119.8|129").getBytes(StandardCharsets.UTF_8); - /** - * Checks and asserts that passed in columns match - * @param expect The expected result column - * @param cv The input column - */ - public static void assertColumnsAreEqual(ColumnView expect, ColumnView cv) { - assertColumnsAreEqual(expect, cv, "unnamed"); - } - - /** - * Checks and asserts that passed in columns match - * @param expected The expected result column - * @param cv The input column - * @param colName The name of the column - */ - public static void assertColumnsAreEqual(ColumnView expected, ColumnView cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); - } - - /** - * Checks and asserts that passed in host columns match - * @param expected The expected result host column - * @param cv The input host column - * @param colName The name of the host column - */ - public static void assertColumnsAreEqual(HostColumnVector expected, HostColumnVector cv, String colName) { - assertPartialColumnsAreEqual(expected, 0, expected.getRowCount(), cv, colName, true, false); - } - - /** - * Checks and asserts that passed in Struct columns match - * @param expected The expected result Struct column - * @param cv The input Struct column - */ - public static void assertStructColumnsAreEqual(ColumnView expected, ColumnView cv) { - assertPartialStructColumnsAreEqual(expected, 0, expected.getRowCount(), cv, "unnamed", true, false); - } - - /** - * Checks and asserts that passed in Struct columns match - * @param expected The expected result Struct column - * @param rowOffset The row number to look from - * @param length The number of rows to consider - * @param cv The input Struct column - * @param colName The name of the column - * @param 
enableNullCountCheck Whether to check for nulls in the Struct column - * @param enableNullabilityCheck Whether the table have a validity mask - */ - public static void assertPartialStructColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { - try (HostColumnVector hostExpected = expected.copyToHost(); - HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCountCheck, enableNullabilityCheck); - } - } - - /** - * Checks and asserts that passed in columns match - * @param expected The expected result column - * @param cv The input column - * @param colName The name of the column - * @param enableNullCheck Whether to check for nulls in the column - * @param enableNullabilityCheck Whether the table have a validity mask - */ - public static void assertPartialColumnsAreEqual(ColumnView expected, long rowOffset, long length, - ColumnView cv, String colName, boolean enableNullCheck, boolean enableNullabilityCheck) { - try (HostColumnVector hostExpected = expected.copyToHost(); - HostColumnVector hostcv = cv.copyToHost()) { - assertPartialColumnsAreEqual(hostExpected, rowOffset, length, hostcv, colName, enableNullCheck, enableNullabilityCheck); - } - } - - /** - * Checks and asserts that passed in host columns match - * @param expected The expected result host column - * @param rowOffset start row index - * @param length number of rows from starting offset - * @param cv The input host column - * @param colName The name of the host column - * @param enableNullCountCheck Whether to check for nulls in the host column - */ - public static void assertPartialColumnsAreEqual(HostColumnVectorCore expected, long rowOffset, long length, - HostColumnVectorCore cv, String colName, boolean enableNullCountCheck, boolean enableNullabilityCheck) { - assertEquals(expected.getType(), cv.getType(), "Type For Column " + colName); - assertEquals(length, cv.getRowCount(), "Row Count For Column " + colName); - assertEquals(expected.getNumChildren(), cv.getNumChildren(), "Child Count for Column " + colName); - if (enableNullCountCheck) { - assertEquals(expected.getNullCount(), cv.getNullCount(), "Null Count For Column " + colName); - } else { - // TODO add in a proper check when null counts are supported by serializing a partitioned column - } - if (enableNullabilityCheck) { - assertEquals(expected.hasValidityVector(), cv.hasValidityVector(), "Column nullability is different than expected"); - } - DType type = expected.getType(); - for (long expectedRow = rowOffset; expectedRow < (rowOffset + length); expectedRow++) { - long tableRow = expectedRow - rowOffset; - assertEquals(expected.isNull(expectedRow), cv.isNull(tableRow), - "NULL for Column " + colName + " Row " + tableRow); - if (!expected.isNull(expectedRow)) { - switch (type.typeId) { - case BOOL8: // fall through - case INT8: // fall through - case UINT8: - assertEquals(expected.getByte(expectedRow), cv.getByte(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case INT16: // fall through - case UINT16: - assertEquals(expected.getShort(expectedRow), cv.getShort(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case INT32: // fall through - case UINT32: // fall through - case TIMESTAMP_DAYS: - case DURATION_DAYS: - case DECIMAL32: - assertEquals(expected.getInt(expectedRow), cv.getInt(tableRow), - "Column " + colName + " Row " + tableRow); - 
break; - case INT64: // fall through - case UINT64: // fall through - case DURATION_MICROSECONDS: // fall through - case DURATION_MILLISECONDS: // fall through - case DURATION_NANOSECONDS: // fall through - case DURATION_SECONDS: // fall through - case TIMESTAMP_MICROSECONDS: // fall through - case TIMESTAMP_MILLISECONDS: // fall through - case TIMESTAMP_NANOSECONDS: // fall through - case TIMESTAMP_SECONDS: - case DECIMAL64: - assertEquals(expected.getLong(expectedRow), cv.getLong(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case DECIMAL128: - assertEquals(expected.getBigDecimal(expectedRow), cv.getBigDecimal(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case FLOAT32: - assertEqualsWithinPercentage(expected.getFloat(expectedRow), cv.getFloat(tableRow), 0.0001, - "Column " + colName + " Row " + tableRow); - break; - case FLOAT64: - assertEqualsWithinPercentage(expected.getDouble(expectedRow), cv.getDouble(tableRow), 0.0001, - "Column " + colName + " Row " + tableRow); - break; - case STRING: - assertArrayEquals(expected.getUTF8(expectedRow), cv.getUTF8(tableRow), - "Column " + colName + " Row " + tableRow); - break; - case LIST: - HostMemoryBuffer expectedOffsets = expected.getOffsets(); - HostMemoryBuffer cvOffsets = cv.getOffsets(); - int expectedChildRows = expectedOffsets.getInt((expectedRow + 1) * 4) - - expectedOffsets.getInt(expectedRow * 4); - int cvChildRows = cvOffsets.getInt((tableRow + 1) * 4) - - cvOffsets.getInt(tableRow * 4); - assertEquals(expectedChildRows, cvChildRows, "Child row count for Column " + - colName + " Row " + tableRow); - break; - case STRUCT: - // parent column only has validity which was checked above - break; - default: - throw new IllegalArgumentException(type + " is not supported yet"); - } - } - } - - if (type.isNestedType()) { - switch (type.typeId) { - case LIST: - int expectedChildRowOffset = 0; - int numChildRows = 0; - if (length > 0) { - HostMemoryBuffer expectedOffsets = expected.getOffsets(); - HostMemoryBuffer cvOffsets = cv.getOffsets(); - expectedChildRowOffset = expectedOffsets.getInt(rowOffset * 4); - numChildRows = expectedOffsets.getInt((rowOffset + length) * 4) - - expectedChildRowOffset; - } - assertPartialColumnsAreEqual(expected.getNestedChildren().get(0), expectedChildRowOffset, - numChildRows, cv.getNestedChildren().get(0), colName + " list child", - enableNullCountCheck, enableNullabilityCheck); - break; - case STRUCT: - List expectedChildren = expected.getNestedChildren(); - List cvChildren = cv.getNestedChildren(); - for (int i = 0; i < expectedChildren.size(); i++) { - HostColumnVectorCore expectedChild = expectedChildren.get(i); - HostColumnVectorCore cvChild = cvChildren.get(i); - String childName = colName + " child " + i; - assertEquals(length, cvChild.getRowCount(), "Row Count for Column " + colName); - assertPartialColumnsAreEqual(expectedChild, rowOffset, length, cvChild, - colName, enableNullCountCheck, enableNullabilityCheck); - } - break; - default: - throw new IllegalArgumentException(type + " is not supported yet"); - } - } - } - - /** - * Checks and asserts that the two tables from a given rowindex match based on a provided schema. 
- * @param expected the expected result table - * @param rowOffset the row number to start checking from - * @param length the number of rows to check - * @param table the input table to compare against expected - * @param enableNullCheck whether to check for nulls or not - * @param enableNullabilityCheck whether the table have a validity mask - */ - public static void assertPartialTablesAreEqual(Table expected, long rowOffset, long length, Table table, - boolean enableNullCheck, boolean enableNullabilityCheck) { - assertEquals(expected.getNumberOfColumns(), table.getNumberOfColumns()); - assertEquals(length, table.getRowCount(), "ROW COUNT"); - for (int col = 0; col < expected.getNumberOfColumns(); col++) { - ColumnVector expect = expected.getColumn(col); - ColumnVector cv = table.getColumn(col); - String name = String.valueOf(col); - if (rowOffset != 0 || length != expected.getRowCount()) { - name = name + " PART " + rowOffset + "-" + (rowOffset + length - 1); - } - assertPartialColumnsAreEqual(expect, rowOffset, length, cv, name, enableNullCheck, enableNullabilityCheck); - } - } - - /** - * Checks and asserts that the two tables match - * @param expected the expected result table - * @param table the input table to compare against expected - */ - public static void assertTablesAreEqual(Table expected, Table table) { - assertPartialTablesAreEqual(expected, 0, expected.getRowCount(), table, true, false); - } - void assertTablesHaveSameValues(HashMap[] expectedTable, Table table) { assertEquals(expectedTable.length, table.getNumberOfColumns()); int numCols = table.getNumberOfColumns(); @@ -358,16 +127,6 @@ void assertTablesHaveSameValues(HashMap[] expectedTable, Table } } - public static void assertTableTypes(DType[] expectedTypes, Table t) { - int len = t.getNumberOfColumns(); - assertEquals(expectedTypes.length, len); - for (int i = 0; i < len; i++) { - ColumnVector vec = t.getColumn(i); - DType type = vec.getType(); - assertEquals(expectedTypes[i], type, "Types don't match at " + i); - } - } - @Test void testMergeSimple() { try (Table table1 = new Table.TestBuilder() diff --git a/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java index 8bf1370a0f7..9a929cec98d 100644 --- a/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/TimestampColumnVectorTest.java @@ -22,7 +22,7 @@ import java.util.function.Function; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static org.junit.jupiter.api.Assertions.assertEquals; public class TimestampColumnVectorTest extends CudfTestBase { diff --git a/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java b/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java index 76970e8bf76..7fcb7cbd85b 100644 --- a/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java +++ b/java/src/test/java/ai/rapids/cudf/UnaryOpTest.java @@ -22,7 +22,7 @@ import ai.rapids.cudf.HostColumnVector.Builder; import org.junit.jupiter.api.Test; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; public class UnaryOpTest extends CudfTestBase { private static final Double[] DOUBLES_1 = new Double[]{1.0, 10.0, -100.1, 5.3, 50.0, 100.0, null, Double.NaN, Double.POSITIVE_INFINITY, 1/9.0, Double.NEGATIVE_INFINITY, 500.0, -500.0}; diff --git a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java 
b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java index 2fb6792b409..e50da0a4d4d 100644 --- a/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java +++ b/java/src/test/java/ai/rapids/cudf/ast/CompiledExpressionTest.java @@ -36,7 +36,7 @@ import java.util.function.Function; import java.util.stream.Stream; -import static ai.rapids.cudf.TableTest.assertColumnsAreEqual; +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; public class CompiledExpressionTest extends CudfTestBase { @Test From 554ac817498e64ba1c7ef054873fab7dc658d25c Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 15:50:56 -0600 Subject: [PATCH 031/202] Load native dependencies when Java ColumnView is loaded (#9800) The Java ColumnView class has native methods but does not ensure the corresponding native libraries that implement those methods are loaded. This adds a static code block to the ColumnView class to load the native libraries when the ColumnView class is loaded. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Kuhu Shukla (https://github.com/kuhushukla) URL: https://github.com/rapidsai/cudf/pull/9800 --- java/src/main/java/ai/rapids/cudf/ColumnView.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 729444f460c..6d0d24baf99 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -30,6 +30,10 @@ */ public class ColumnView implements AutoCloseable, BinaryOperable { + static { + NativeDepsLoader.loadNativeDeps(); + } + public static final long UNKNOWN_NULL_COUNT = -1; protected long viewHandle; From 20d6723fcb5eaffb6398e5cf6c14de8d774ca917 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 30 Nov 2021 15:51:12 -0600 Subject: [PATCH 032/202] Copy Java native dependencies directly into classpath (#9787) Eliminates the intermediate copy of the native libraries for the Java bindings into target/native-deps, instead copying libcudf.so and libcudfjni.so directly into the classpath resources. This eliminates the need to search target/native-deps at runtime when the native libraries are not in the classpath in the case of running tests before the jar is built. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9787 --- java/pom.xml | 7 ++----- .../main/java/ai/rapids/cudf/NativeDepsLoader.java | 11 ++--------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index 87d43ec1272..c5a3bc64fad 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -297,9 +297,6 @@ LICENSE - - ${project.build.directory}/native-deps/ - @@ -499,14 +496,14 @@ copy-native-libs - validate + generate-resources copy-resources true ${skipNativeCopy} - ${project.build.directory}/native-deps/${os.arch}/${os.name} + ${project.build.outputDirectory}/${os.arch}/${os.name} ${native.build.path} diff --git a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java index 8780ecc3aa3..9663fbcafb4 100755 --- a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java +++ b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java @@ -81,9 +81,7 @@ public static synchronized void loadNativeDeps() { /** * Allows other libraries to reuse the same native deps loading logic. 
Libraries will be searched - * for under ${os.arch}/${os.name}/ in the class path using the class loader for this class. It - * will also look for the libraries under ./target/native-deps/${os.arch}/${os.name} to help - * facilitate testing while building. + * for under ${os.arch}/${os.name}/ in the class path using the class loader for this class. *
* Because this just loads the libraries and loading the libraries themselves needs to be a * singleton operation it is recommended that any library using this provide their own wrapper @@ -203,12 +201,7 @@ private static File createFile(String os, String arch, String baseName) throws I File loc; URL resource = loader.getResource(path); if (resource == null) { - // It looks like we are not running from the jar, or there are issues with the jar - File f = new File("./target/native-deps/" + path); - if (!f.exists()) { - throw new FileNotFoundException("Could not locate native dependency " + path); - } - resource = f.toURI().toURL(); + throw new FileNotFoundException("Could not locate native dependency " + path); } try (InputStream in = resource.openStream()) { loc = File.createTempFile(baseName, ".so"); From 991136c78be01d4de20387086a185cfd5a21713b Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Tue, 30 Nov 2021 15:31:53 -0800 Subject: [PATCH 033/202] Add Pearson correlation for sort groupby (python) (#9166) Fixes: https://github.com/rapidsai/cudf/issues/8691 Authors: - Sheilah Kirui (https://github.com/skirui-source) - Karthikeyan (https://github.com/karthikeyann) - Ashwin Srinath (https://github.com/shwina) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Michael Wang (https://github.com/isVoid) - Mayank Anand (https://github.com/mayankanand007) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9166 --- docs/cudf/source/api_docs/groupby.rst | 1 + docs/cudf/source/basics/groupby.rst | 10 ++ python/cudf/cudf/_lib/aggregation.pyx | 55 +++++++++- python/cudf/cudf/_lib/cpp/aggregation.pxd | 15 ++- python/cudf/cudf/_lib/groupby.pyx | 4 +- python/cudf/cudf/core/groupby/groupby.py | 121 +++++++++++++++++++++- python/cudf/cudf/tests/test_dataframe.py | 115 ++++++++++++++++++++ 7 files changed, 314 insertions(+), 7 deletions(-) diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst index cf08d1d791b..575d7442cdf 100644 --- a/docs/cudf/source/api_docs/groupby.rst +++ b/docs/cudf/source/api_docs/groupby.rst @@ -59,6 +59,7 @@ Computations / descriptive stats GroupBy.std GroupBy.sum GroupBy.var + GroupBy.corr The following methods are available in both ``SeriesGroupBy`` and ``DataFrameGroupBy`` objects, but may differ slightly, usually in that diff --git a/docs/cudf/source/basics/groupby.rst b/docs/cudf/source/basics/groupby.rst index 04c4d42fa2a..f3269768025 100644 --- a/docs/cudf/source/basics/groupby.rst +++ b/docs/cudf/source/basics/groupby.rst @@ -127,6 +127,13 @@ Aggregations on groups is supported via the ``agg`` method: a 1 4 1 2.0 2 5 2 4.5 + >>> df.groupby("a").corr(method="pearson") + b c + a + 1 b 1.000000 0.866025 + c 0.866025 1.000000 + 2 b 1.000000 1.000000 + c 1.000000 1.000000 The following table summarizes the available aggregations and the types that support them: @@ -169,6 +176,9 @@ that support them: +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ | unique | ✅ | ✅ | ✅ | ✅ | | | | | +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + | corr | ✅ | | | | | | | ✅ | + +------------------------------------+-----------+------------+----------+---------------+--------+----------+------------+-----------+ + GroupBy apply ------------- diff --git 
a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 4f703724cef..68f7101b6ee 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -1,6 +1,6 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. -from enum import Enum +from enum import Enum, IntEnum import numba import numpy as np @@ -30,6 +30,7 @@ from cudf._lib.types import Interpolation cimport cudf._lib.cpp.aggregation as libcudf_aggregation cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib.cpp.aggregation cimport underlying_type_t_correlation_type import cudf @@ -57,6 +58,22 @@ class AggregationKind(Enum): UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA + CORRELATION = libcudf_aggregation.aggregation.Kind.CORRELATION + + +class CorrelationType(IntEnum): + PEARSON = ( + + libcudf_aggregation.correlation_type.PEARSON + ) + KENDALL = ( + + libcudf_aggregation.correlation_type.KENDALL + ) + SPEARMAN = ( + + libcudf_aggregation.correlation_type.SPEARMAN + ) cdef class Aggregation: @@ -321,6 +338,22 @@ cdef class Aggregation: )) return agg + @classmethod + def corr(cls, method, libcudf_types.size_type min_periods): + cdef Aggregation agg = cls() + cdef libcudf_aggregation.correlation_type c_method = ( + ( + ( + CorrelationType[method.upper()] + ) + ) + ) + agg.c_obj = move( + libcudf_aggregation.make_correlation_aggregation[aggregation]( + c_method, min_periods + )) + return agg + cdef class RollingAggregation: """A Cython wrapper for rolling window aggregations. @@ -692,6 +725,24 @@ cdef class GroupbyAggregation: ) return agg + @classmethod + def corr(cls, method, libcudf_types.size_type min_periods): + cdef GroupbyAggregation agg = cls() + cdef libcudf_aggregation.correlation_type c_method = ( + ( + ( + CorrelationType[method.upper()] + ) + ) + ) + agg.c_obj = move( + libcudf_aggregation. + make_correlation_aggregation[groupby_aggregation]( + c_method, min_periods + )) + return agg + + cdef class GroupbyScanAggregation: """A Cython wrapper for groupby scan aggregations. diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 13bfa49057c..3982b4fecbb 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,5 +1,5 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+from libc.stdint cimport int32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -11,6 +11,7 @@ from cudf._lib.cpp.types cimport ( size_type, ) +ctypedef int32_t underlying_type_t_correlation_type cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: @@ -38,6 +39,8 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' + CORRELATION 'cudf::aggregation::CORRELATION' + Kind kind cdef cppclass rolling_aggregation: @@ -53,6 +56,11 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: CUDA 'cudf::udf_type::CUDA' PTX 'cudf::udf_type::PTX' + ctypedef enum correlation_type: + PEARSON 'cudf::correlation_type::PEARSON' + KENDALL 'cudf::correlation_type::KENDALL' + SPEARMAN 'cudf::correlation_type::SPEARMAN' + cdef unique_ptr[T] make_sum_aggregation[T]() except + cdef unique_ptr[T] make_product_aggregation[T]() except + @@ -106,3 +114,6 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: udf_type type, string user_defined_aggregator, data_type output_type) except + + + cdef unique_ptr[T] make_correlation_aggregation[T]( + correlation_type type, size_type min_periods) except + diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 0968d22d465..314542c9549 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from collections import defaultdict @@ -54,7 +54,7 @@ _CATEGORICAL_AGGS = {"COUNT", "SIZE", "NUNIQUE", "UNIQUE"} _STRING_AGGS = {"COUNT", "SIZE", "MAX", "MIN", "NUNIQUE", "NTH", "COLLECT", "UNIQUE"} _LIST_AGGS = {"COLLECT"} -_STRUCT_AGGS = set() +_STRUCT_AGGS = {"CORRELATION"} _INTERVAL_AGGS = set() _DECIMAL_AGGS = {"COUNT", "SUM", "ARGMIN", "ARGMAX", "MIN", "MAX", "NUNIQUE", "NTH", "COLLECT"} diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 7f9f61ed3fd..f1d622362e2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import collections +import itertools import pickle import warnings @@ -13,7 +14,8 @@ from cudf._typing import DataFrameOrSeries from cudf.api.types import is_list_like from cudf.core.abc import Serializable -from cudf.core.column.column import arange +from cudf.core.column.column import arange, as_column +from cudf.core.multiindex import MultiIndex from cudf.utils.utils import GetAttrGetItemMixin, cached_property @@ -69,6 +71,8 @@ def __init__( """ self.obj = obj self._as_index = as_index + self._by = by + self._level = level self._sort = sort self._dropna = dropna @@ -777,6 +781,121 @@ def median(self): """Get the column-wise median of the values in each group.""" return self.agg("median") + def corr(self, method="pearson", min_periods=1): + """ + Compute pairwise correlation of columns, excluding NA/null values. + + Parameters + ---------- + method: {"pearson", "kendall", "spearman"} or callable, + default "pearson". Currently only the pearson correlation + coefficient is supported. + + min_periods: int, optional + Minimum number of observations required per pair of columns + to have a valid result. + + Returns + ---------- + DataFrame + Correlation matrix. + + Examples + -------- + >>> import cudf + >>> gdf = cudf.DataFrame({ + ... 
"id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + ... "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + ... "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + ... "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]}) + >>> gdf + id val1 val2 val3 + 0 a 5 4 4 + 1 a 4 5 5 + 2 a 6 6 6 + 3 b 4 1 1 + 4 b 8 2 2 + 5 b 7 9 9 + 6 c 4 8 8 + 7 c 5 5 5 + 8 c 2 1 1 + >>> gdf.groupby("id").corr(method="pearson") + val1 val2 val3 + id + a val1 1.000000 0.500000 0.500000 + val2 0.500000 1.000000 1.000000 + val3 0.500000 1.000000 1.000000 + b val1 1.000000 0.385727 0.385727 + val2 0.385727 1.000000 1.000000 + val3 0.385727 1.000000 1.000000 + c val1 1.000000 0.714575 0.714575 + val2 0.714575 1.000000 1.000000 + val3 0.714575 1.000000 1.000000 + """ + + if not method.lower() in ("pearson",): + raise NotImplementedError( + "Only pearson correlation is currently supported" + ) + + # create expanded dataframe consisting all combinations of the + # struct columns-pairs to be correlated + # i.e (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) + _cols = self.grouping.values.columns.tolist() + len_cols = len(_cols) + + new_df_data = {} + for x, y in itertools.combinations_with_replacement(_cols, 2): + new_df_data[(x, y)] = cudf.DataFrame._from_data( + {"x": self.obj._data[x], "y": self.obj._data[y]} + ).to_struct() + new_gb = cudf.DataFrame._from_data(new_df_data).groupby( + by=self.grouping.keys + ) + + try: + gb_corr = new_gb.agg(lambda x: x.corr(method, min_periods)) + except RuntimeError as e: + if "Unsupported groupby reduction type-agg combination" in str(e): + raise TypeError( + "Correlation accepts only numerical column-pairs" + ) + raise + + # ensure that column-pair labels are arranged in ascending order + cols_list = [ + (y, x) if i > j else (x, y) + for j, y in enumerate(_cols) + for i, x in enumerate(_cols) + ] + cols_split = [ + cols_list[i : i + len_cols] + for i in range(0, len(cols_list), len_cols) + ] + + # interleave: combine the correlation results for each column-pair + # into a single column + res = cudf.DataFrame._from_data( + { + x: gb_corr.loc[:, i].interleave_columns() + for i, x in zip(cols_split, _cols) + } + ) + + # create a multiindex for the groupby correlated dataframe, + # to match pandas behavior + unsorted_idx = gb_corr.index.repeat(len_cols) + idx_sort_order = unsorted_idx._get_sorted_inds() + sorted_idx = unsorted_idx._gather(idx_sort_order) + if len(gb_corr): + # TO-DO: Should the operation below be done on the CPU instead? + sorted_idx._data[None] = as_column( + cudf.Series(_cols).tile(len(gb_corr.index)) + ) + res.index = MultiIndex._from_data(sorted_idx._data) + + return res + def var(self, ddof=1): """Compute the column-wise variance of the values in each group. 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d07caef11d5..d555b5c4033 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8924,3 +8924,118 @@ def test_frame_series_where_other(data): expected = gdf.where(gdf["b"] == 1, 0) actual = pdf.where(pdf["b"] == 1, 0) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data, gkey", + [ + ( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + }, + ["id", "val1", "val2"], + ), + ( + { + "id": [0] * 4 + [1] * 3, + "a": [10, 3, 4, 2, -3, 9, 10], + "b": [10, 23, -4, 2, -3, 9, 19], + }, + ["id", "a"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val": [None, None, None, None, None, None], + }, + ["id"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val1": [None, 4, 6, 8, None, 2], + "val2": [4, 5, None, 2, 9, None], + }, + ["id"], + ), + ({"id": [1.0], "val1": [2.0], "val2": [3.0]}, ["id"]), + ], +) +@pytest.mark.parametrize( + "min_per", [0, 1, 2, 3, 4], +) +def test_pearson_corr_passing(data, gkey, min_per): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.groupby(gkey).corr(method="pearson", min_periods=min_per) + expected = pdf.groupby(gkey).corr(method="pearson", min_periods=min_per) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize("method", ["kendall", "spearman"]) +def test_pearson_corr_unsupported_methods(method): + gdf = cudf.DataFrame( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + } + ) + + with pytest.raises( + NotImplementedError, + match="Only pearson correlation is currently supported", + ): + gdf.groupby("id").corr(method) + + +def test_pearson_corr_empty_columns(): + gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) + pdf = gdf.to_pandas() + + actual = gdf.groupby("id").corr("pearson") + expected = pdf.groupby("id").corr("pearson") + + assert_eq( + expected, actual, check_dtype=False, check_index_type=False, + ) + + +@pytest.mark.parametrize( + "data", + [ + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + }, + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [1, 1, 1, 2, 2, 2, 3, 3, 3], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + }, + ], +) +@pytest.mark.parametrize("gkey", ["id", "val1", "val2"]) +def test_pearson_corr_invalid_column_types(data, gkey): + with pytest.raises( + TypeError, match="Correlation accepts only numerical column-pairs", + ): + cudf.DataFrame(data).groupby(gkey).corr("pearson") + + +def test_pearson_corr_multiindex_dataframe(): + gdf = cudf.DataFrame( + {"a": [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [2, 3, 4, 5]} + ).set_index(["a", "b"]) + + actual = gdf.groupby(level="a").corr("pearson") + expected = gdf.to_pandas().groupby(level="a").corr("pearson") + + assert_eq(expected, actual) From 1eabcb73b7df235de9985e207e2087af9dfb0e14 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Wed, 1 Dec 2021 17:03:36 +0530 Subject: [PATCH 034/202] Fix some doxygen warnings and add missing documentation (#9770) fix to ignore `__device__ void` return type warnings. 
add missing documentation on some functions.
correct doxygen doc-style comments.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/9770
---
 cpp/doxygen/Doxyfile                           |  7 ++++---
 cpp/include/cudf/lists/combine.hpp             |  2 +-
 cpp/include/cudf/scalar/scalar_device_view.cuh | 16 ++++++++++++++++
 .../cudf/strings/convert/convert_lists.hpp     |  2 +-
 cpp/include/cudf/table/row_operators.cuh       |  3 ++-
 cpp/include/cudf_test/base_fixture.hpp         |  3 +++
 cpp/include/cudf_test/column_wrapper.hpp       |  3 +++
 cpp/include/cudf_test/file_utilities.hpp       |  9 +++++++++
 cpp/include/cudf_test/table_utilities.hpp      |  2 +-
 9 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index 55e5119040e..6a556bb4b34 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -2089,7 +2089,7 @@ ENABLE_PREPROCESSING = YES
 # The default value is: NO.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-MACRO_EXPANSION = NO
+MACRO_EXPANSION = YES
 
 # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
 # the macro expansion is limited to the macros specified with the PREDEFINED and
 # The default value is: NO.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-EXPAND_ONLY_PREDEF = NO
+EXPAND_ONLY_PREDEF = YES
 
 # If the SEARCH_INCLUDES tag is set to YES, the include files in the
 # INCLUDE_PATH will be searched if a #include is found.
 
@@ -2129,7 +2129,8 @@ INCLUDE_FILE_PATTERNS =
 # recursively expanded use the := operator instead of the = operator.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-PREDEFINED =
+PREDEFINED = __device__= \
+             __host__=
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp
index a9407ed57ca..61a81e8a745 100644
--- a/cpp/include/cudf/lists/combine.hpp
+++ b/cpp/include/cudf/lists/combine.hpp
@@ -26,7 +26,7 @@ namespace lists {
  * @file
 */
 
-/*
+/**
 * @brief Flag to specify whether a null list element will be ignored from concatenation, or the
 * entire concatenation result involving null list elements will be a null element.
*/ diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index 884b412d3e2..56afa150dfc 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -91,6 +91,12 @@ class fixed_width_scalar_device_view_base : public detail::scalar_device_view_ba return *data(); } + /** + * @brief Stores the value in scalar + * + * @tparam T The desired type + * @param value The value to store in scalar + */ template __device__ void set_value(T value) { @@ -159,6 +165,11 @@ class fixed_width_scalar_device_view : public detail::fixed_width_scalar_device_ return fixed_width_scalar_device_view_base::value(); } + /** + * @brief Stores the value in scalar + * + * @param value The value to store in scalar + */ __device__ void set_value(T value) { fixed_width_scalar_device_view_base::set_value(value); } /** @@ -218,6 +229,11 @@ class fixed_point_scalar_device_view : public detail::scalar_device_view_base { { } + /** + * @brief Stores the value in scalar + * + * @param value The value to store in scalar + */ __device__ void set_value(rep_type value) { *_data = value; } /** diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp index ec22186ea99..279bf44e7fc 100644 --- a/cpp/include/cudf/strings/convert/convert_lists.hpp +++ b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -50,7 +50,7 @@ namespace strings { * * @param input Lists column to format. * @param na_rep Replacment string for null elements. - * @param separator Strings to use for enclosing list components and separating elements. + * @param separators Strings to use for enclosing list components and separating elements. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index c719c564a87..70ccac2f75d 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -67,7 +67,7 @@ __device__ weak_ordering compare_elements(Element lhs, Element rhs) } } // namespace detail -/* +/** * @brief A specialization for floating-point `Element` type relational comparison * to derive the order of the elements with respect to `lhs`. Specialization is to * handle `nan` in the order shown below. @@ -187,6 +187,7 @@ class element_equality_comparator { * * @param lhs_element_index The index of the first element * @param rhs_element_index The index of the second element + * @return True if both lhs and rhs element are both nulls and `nulls_are_equal` is true, or equal * */ template ()>* = nullptr> T generate() @@ -211,6 +213,7 @@ class TempDirTestEnvironment : public ::testing::Environment { /** * @brief Get a temporary filepath to use for the specified filename * + * @param filename name of the file to be placed in temporary directory. 
   * @return std::string The temporary filepath
    */
   std::string get_temp_filepath(std::string filename) { return tmpdir.path() + filename; }
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index f291b04776a..cd2ac9f3ec1 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -79,6 +79,7 @@ class column_wrapper {
 
   /**
    * @brief Releases internal unique_ptr to wrapped column
+   * @return unique_ptr to wrapped column
    */
   std::unique_ptr release() { return std::move(wrapped); }
 
@@ -1040,11 +1041,13 @@ class dictionary_column_wrapper : public detail::column_wrapper {
 
   /**
    * @brief Access keys column view
+   * @return column_view to keys column
    */
   column_view keys() const { return cudf::dictionary_column_view{wrapped->view()}.keys(); }
 
   /**
    * @brief Access indices column view
+   * @return column_view to indices column
    */
   column_view indices() const { return cudf::dictionary_column_view{wrapped->view()}.indices(); }
 
diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp
index 90bf0cd99dc..8e242e5a4f3 100644
--- a/cpp/include/cudf_test/file_utilities.hpp
+++ b/cpp/include/cudf_test/file_utilities.hpp
@@ -24,6 +24,10 @@
 
 #include 
 
+/**
+ * @brief RAII class for creating a temporary directory.
+ *
+ */
 class temp_directory {
  std::string _path;
 
@@ -49,5 +53,10 @@ class temp_directory {
    nftw(_path.c_str(), rm_files, 10, FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
  }
 
+  /**
+   * @brief Returns the path of the temporary directory
+   *
+   * @return string path of the temporary directory
+   */
   const std::string& path() const { return _path; }
 };
diff --git a/cpp/include/cudf_test/table_utilities.hpp b/cpp/include/cudf_test/table_utilities.hpp
index 831c9f5ac14..f2427c5b8c6 100644
--- a/cpp/include/cudf_test/table_utilities.hpp
+++ b/cpp/include/cudf_test/table_utilities.hpp
@@ -39,7 +39,7 @@ void expect_table_properties_equal(cudf::table_view lhs, cudf::table_view rhs);
  */
 void expect_tables_equal(cudf::table_view lhs, cudf::table_view rhs);
 
-/*
+/**
 * @brief Verifies the equivalency of two tables.
 *
 * Treats null elements as equivalent. Columns that have nullability but no nulls,

From 1ceb8ab01120ffe463600db14e6893e196cbb991 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 1 Dec 2021 10:10:10 -0500
Subject: [PATCH 035/202] Improve build time of libcudf iterator tests (#9788)

While working on #9641 I noticed that building the iterator gtests takes a lot
of time in CI. Here is a link to the individual build times for libcudf
including the gtests:
https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=driver-495,LINUX_VER=ubuntu20.04,PYTHON=3.8/5173/testReport/(root)/BuildTime/
(you can sort by Duration by clicking on the table column header).

Here is a table of the top 20 compile time offenders as recorded on my local machine.
Note that, like the CI build output, 6 of the top 20 are just building the `ITERATOR_TEST`.

| rank | time (ms) | file |
| ---:| ---:|:--- |
| 1 | 814334 | /cudf.dir/src/search/search.cu.o
| 2 | 755375 | /cudf.dir/src/sort/sort_column.cu.o
| 3 | 686235 | /ITERATOR_TEST.dir/iterator/optional_iterator_test_numeric.cu.o
| 4 | 670587 | /cudf.dir/src/groupby/sort/group_nunique.cu.o
| 5 | 585524 | /cudf.dir/src/reductions/scan/scan_inclusive.cu.o
| 6 | 582677 | /ITERATOR_TEST.dir/iterator/pair_iterator_test_numeric.cu.o
| 7 | 568418 | /ITERATOR_TEST.dir/iterator/scalar_iterator_test.cu.o
| 8 | 563196 | /cudf.dir/src/sort/sort.cu.o
| 9 | 548816 | /ITERATOR_TEST.dir/iterator/value_iterator_test_numeric.cu.o
| 10 | 535315 | /cudf.dir/src/groupby/sort/sort_helper.cu.o
| 11 | 531384 | /cudf.dir/src/sort/is_sorted.cu.o
| 12 | 530382 | /ITERATOR_TEST.dir/iterator/value_iterator_test_chrono.cu.o
| 13 | 525187 | /cudf.dir/src/join/semi_join.cu.o
| 14 | 523726 | /cudf.dir/src/rolling/rolling.cu.o
| 15 | 517909 | /cudf.dir/src/reductions/product.cu.o
| 16 | 513119 | /cudf.dir/src/stream_compaction/distinct_count.cu.o
| 17 | 512569 | /ITERATOR_TEST.dir/iterator/optional_iterator_test_chrono.cu.o
| 18 | 508978 | /cudf.dir/src/reductions/sum_of_squares.cu.o
| 19 | 508460 | /cudf.dir/src/lists/drop_list_duplicates.cu.o
| 20 | 505247 | /cudf.dir/src/reductions/sum.cu.o

I made some simple changes to the iterator code logic to use different thrust
functions along with a temporary device vector. This approach improved the
compile time of the `ITERATOR_TEST` by about 3x. Here are the results of
compiling the above 6 files with the changes in this PR.

| new rank | new time (ms) | file |
| ---:| ---:|:--- |
| 59 | 232691 (2.9x) | optional_iterator_test_numeric.cu.o |
| 26 | 416951 (1.4x) | pair_iterator_test_numeric.cu.o |
| 92 | 165947 (3.4x) | scalar_iterator_test.cu.o |
| 65 | 216364 (2.5x) | value_iterator_test_numeric.cu.o |
| 77 | 186583 (2.8x) | value_iterator_test_chrono.cu.o |
| 111 | 137789 (3.7x) | optional_iterator_test_chrono.cu.o |

Total overall build time improved locally by ~3m (10%) using `ninja -j48 install`
on a Dell 5820. A minimal sketch of the code change follows; after it is a link
to the build time results of a CI build with these changes.
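The sketch below is a host-side illustration using `std::` algorithms as stand-ins for the `thrust::` calls in the actual change (the vectors and lambdas here are made up for the example, not taken from the test code): the single `equal` call is split into an elementwise `transform` into a temporary buffer plus an `all_of` pass over that buffer, which is the pattern that compiles faster with thrust.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> a{1, 2, 3, 4};
  std::vector<int> b{1, 2, 3, 4};

  // One-step form, analogous to the original thrust::equal call.
  bool eq1 = std::equal(a.begin(), a.end(), b.begin());

  // Two-step form, analogous to thrust::transform + thrust::all_of with a
  // temporary device vector: store the elementwise comparison results first,
  // then check that they are all true.
  std::vector<bool> results(a.size());
  std::transform(a.begin(), a.end(), b.begin(), results.begin(),
                 [](int x, int y) { return x == y; });
  bool eq2 =
    std::all_of(results.begin(), results.end(), [](bool v) { return v; });

  std::printf("%d %d\n", eq1, eq2);  // prints: 1 1
  return 0;
}
```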
https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-gpu-test/CUDA=11.5,GPU_LABEL=driver-495,LINUX_VER=ubuntu20.04,PYTHON=3.8/5190/testReport/(root)/BuildTime/ Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/9788 --- cpp/tests/iterator/iterator_tests.cuh | 17 +++++++-- .../optional_iterator_test_numeric.cu | 37 +++++++++---------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 4ec347c4bc1..07eb595449c 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -18,8 +18,8 @@ #include #include -#include // include iterator header -#include //for meanvar +#include +#include // for meanvar #include #include @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -83,7 +84,17 @@ struct IteratorTest : public cudf::test::BaseFixture { EXPECT_EQ(thrust::distance(d_in, d_in_last), num_items); auto dev_expected = cudf::detail::make_device_uvector_sync(expected); - bool result = thrust::equal(thrust::device, d_in, d_in_last, dev_expected.begin()); + // using a temporary vector and calling transform and all_of separately is + // equivalent to thrust::equal but compiles ~3x faster + auto dev_results = rmm::device_uvector(num_items, rmm::cuda_stream_default); + thrust::transform(thrust::device, + d_in, + d_in_last, + dev_expected.begin(), + dev_results.begin(), + thrust::equal_to{}); + auto result = thrust::all_of( + thrust::device, dev_results.begin(), dev_results.end(), thrust::identity{}); EXPECT_TRUE(result) << "thrust test"; } diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu index 6d51f4a5c14..a8c135a726f 100644 --- a/cpp/tests/iterator/optional_iterator_test_numeric.cu +++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu @@ -50,21 +50,15 @@ struct transformer_optional_meanvar { } }; -struct sum_if_not_null { - template - CUDA_HOST_DEVICE_CALLABLE thrust::optional operator()(const thrust::optional& lhs, - const thrust::optional& rhs) - { - return lhs.value_or(T{0}) + rhs.value_or(T{0}); - } +template +struct optional_to_meanvar { + CUDA_HOST_DEVICE_CALLABLE T operator()(const thrust::optional& v) { return v.value_or(T{0}); } }; // TODO: enable this test also at __CUDACC_DEBUG__ // This test causes fatal compilation error only at device debug mode. // Workaround: exclude this test only at device debug mode. #if !defined(__CUDACC_DEBUG__) -// This test computes `count`, `sum`, `sum_of_squares` at a single reduction call. 
-// It would be useful for `var`, `std` operation TYPED_TEST(NumericOptionalIteratorTest, mean_var_output) { using T = TypeParam; @@ -104,22 +98,27 @@ TYPED_TEST(NumericOptionalIteratorTest, mean_var_output) expected_value.value_squared = std::accumulate( replaced_array.begin(), replaced_array.end(), T{0}, [](T acc, T i) { return acc + i * i; }); - // std::cout << "expected = " << expected_value << std::endl; - // GPU test auto it_dev = d_col->optional_begin(cudf::contains_nulls::YES{}); auto it_dev_squared = thrust::make_transform_iterator(it_dev, transformer); - auto result = thrust::reduce(it_dev_squared, - it_dev_squared + d_col->size(), - thrust::optional{T_output{}}, - sum_if_not_null{}); + + // this can be computed with a single reduce and without a temporary output vector + // but the approach increases the compile time by ~2x + auto results = rmm::device_uvector(d_col->size(), rmm::cuda_stream_default); + thrust::transform(thrust::device, + it_dev_squared, + it_dev_squared + d_col->size(), + results.begin(), + optional_to_meanvar{}); + auto result = thrust::reduce(thrust::device, results.begin(), results.end(), T_output{}); + if (not std::is_floating_point()) { - EXPECT_EQ(expected_value, *result) << "optional iterator reduction sum"; + EXPECT_EQ(expected_value, result) << "optional iterator reduction sum"; } else { - EXPECT_NEAR(expected_value.value, result->value, 1e-3) << "optional iterator reduction sum"; - EXPECT_NEAR(expected_value.value_squared, result->value_squared, 1e-3) + EXPECT_NEAR(expected_value.value, result.value, 1e-3) << "optional iterator reduction sum"; + EXPECT_NEAR(expected_value.value_squared, result.value_squared, 1e-3) << "optional iterator reduction sum squared"; - EXPECT_EQ(expected_value.count, result->count) << "optional iterator reduction count"; + EXPECT_EQ(expected_value.count, result.count) << "optional iterator reduction count"; } } #endif From 11c3dfef2e7fe6fd67ff93bdf36a47c0a5b2eb37 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 1 Dec 2021 10:28:24 -0600 Subject: [PATCH 036/202] Remove unused masked udf cython/c++ code (#9792) This PR removes the c++ side of the original masked UDF code introduced in https://github.com/rapidsai/cudf/pull/8213. These kernels had some limitations and are now superseded by the numba-generated versions we moved to in https://github.com/rapidsai/cudf/pull/9174. As far as I can tell, cuDF python was the only thing consuming this API for the short time it has existed. However I am marking this breaking just in case. 
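For reference, the deleted JIT kernel (visible in the diff below) forwarded every input element to the UDF together with a validity flag read from the column's null mask, via a small `Masked` wrapper. Here is a minimal host-side sketch of that masked-value idea; the `masked_add` function and its null-propagation rule are illustrative assumptions for this sketch, not the exact contract of the removed kernel:

```cpp
#include <cstdio>

// A value paired with a validity flag, mirroring the Masked struct defined in
// the removed masked_udf_kernel.cu.
template <typename T>
struct Masked {
  T value;
  bool valid;
};

// Hypothetical masked UDF: the output is valid only when both inputs are
// valid, one common null-propagation convention for such UDFs.
template <typename T>
Masked<T> masked_add(Masked<T> lhs, Masked<T> rhs)
{
  return Masked<T>{lhs.value + rhs.value, lhs.valid && rhs.valid};
}

int main()
{
  Masked<int> a{2, true};
  Masked<int> b{3, false};  // represents a null element
  Masked<int> c = masked_add(a, b);
  std::printf("value=%d valid=%d\n", c.value, c.valid);  // value=5 valid=0
  return 0;
}
```

The numba-generated kernels referenced above carry the same (value, validity) pairing through the UDF, which is what made this C++ JIT path redundant.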
Authors: - https://github.com/brandon-b-miller Approvers: - Mark Harris (https://github.com/harrism) - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9792 --- .../Modules/JitifyPreprocessKernels.cmake | 4 +- cpp/include/cudf/transform.hpp | 6 -- cpp/src/transform/jit/masked_udf_kernel.cu | 85 --------------- cpp/src/transform/transform.cpp | 102 ------------------ python/cudf/cudf/_lib/cpp/transform.pxd | 6 -- python/cudf/cudf/_lib/transform.pyx | 24 ----- 6 files changed, 2 insertions(+), 225 deletions(-) delete mode 100644 cpp/src/transform/jit/masked_udf_kernel.cu diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index c2ad25760b8..6ab1293ab6f 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -51,8 +51,8 @@ function(jit_preprocess_files) endfunction() jit_preprocess_files( - SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu - transform/jit/masked_udf_kernel.cu transform/jit/kernel.cu rolling/jit/kernel.cu + SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu transform/jit/kernel.cu + rolling/jit/kernel.cu ) add_custom_target( diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index 55e7bc84dbe..45e8ff1310c 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -54,12 +54,6 @@ std::unique_ptr transform( bool is_ptx, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -std::unique_ptr generalized_masked_op( - table_view const& data_view, - std::string const& binary_udf, - data_type output_type, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Creates a null_mask from `input` by converting `NaN` to null and * preserving existing null values and also returns new null_count. diff --git a/cpp/src/transform/jit/masked_udf_kernel.cu b/cpp/src/transform/jit/masked_udf_kernel.cu deleted file mode 100644 index 319ad730c53..00000000000 --- a/cpp/src/transform/jit/masked_udf_kernel.cu +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include - -namespace cudf { -namespace transformation { -namespace jit { - -template -struct Masked { - T value; - bool valid; -}; - -template -__device__ auto make_args(cudf::size_type id, TypeIn in_ptr, MaskType in_mask, OffsetType in_offset) -{ - bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true; - return cuda::std::make_tuple(in_ptr[id], valid); -} - -template -__device__ auto make_args(cudf::size_type id, - InType in_ptr, - MaskType in_mask, // in practice, always cudf::bitmask_type const* - OffsetType in_offset, // in practice, always cudf::size_type - Arguments... 
args) -{ - bool valid = in_mask ? cudf::bit_is_set(in_mask, in_offset + id) : true; - return cuda::std::tuple_cat(cuda::std::make_tuple(in_ptr[id], valid), make_args(id, args...)); -} - -template -__global__ void generic_udf_kernel(cudf::size_type size, - TypeOut* out_data, - bool* out_mask, - Arguments... args) -{ - int const tid = threadIdx.x; - int const blkid = blockIdx.x; - int const blksz = blockDim.x; - int const gridsz = gridDim.x; - int const start = tid + blkid * blksz; - int const step = blksz * gridsz; - - Masked output; - for (cudf::size_type i = start; i < size; i += step) { - auto func_args = cuda::std::tuple_cat( - cuda::std::make_tuple(&output.value), - make_args(i, args...) // passed int64*, bool*, int64, int64*, bool*, int64 - ); - cuda::std::apply(GENERIC_OP, func_args); - out_data[i] = output.value; - out_mask[i] = output.valid; - } -} - -} // namespace jit -} // namespace transformation -} // namespace cudf diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp index 5230b853a79..0cca6699586 100644 --- a/cpp/src/transform/transform.cpp +++ b/cpp/src/transform/transform.cpp @@ -19,12 +19,10 @@ #include #include #include -#include #include #include #include -#include #include #include @@ -65,80 +63,6 @@ void unary_operation(mutable_column_view output, cudf::jit::get_data_ptr(input)); } -std::vector make_template_types(column_view outcol_view, table_view const& data_view) -{ - std::string mskptr_type = - cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id())) + "*"; - std::string offset_type = - cudf::jit::get_type_name(cudf::data_type(cudf::type_to_id())); - - std::vector template_types; - template_types.reserve((3 * data_view.num_columns()) + 1); - - template_types.push_back(cudf::jit::get_type_name(outcol_view.type())); - for (auto const& col : data_view) { - template_types.push_back(cudf::jit::get_type_name(col.type()) + "*"); - template_types.push_back(mskptr_type); - template_types.push_back(offset_type); - } - return template_types; -} - -void generalized_operation(table_view const& data_view, - std::string const& udf, - data_type output_type, - mutable_column_view outcol_view, - mutable_column_view outmsk_view, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const template_types = make_template_types(outcol_view, data_view); - - std::string generic_kernel_name = - jitify2::reflection::Template("cudf::transformation::jit::generic_udf_kernel") - .instantiate(template_types); - - std::string generic_cuda_source = cudf::jit::parse_single_function_ptx( - udf, "GENERIC_OP", cudf::jit::get_type_name(output_type), {0}); - - std::vector kernel_args; - kernel_args.reserve((data_view.num_columns() * 3) + 3); - - cudf::size_type size = outcol_view.size(); - const void* outcol_ptr = cudf::jit::get_data_ptr(outcol_view); - const void* outmsk_ptr = cudf::jit::get_data_ptr(outmsk_view); - kernel_args.insert(kernel_args.begin(), {&size, &outcol_ptr, &outmsk_ptr}); - - std::vector data_ptrs; - std::vector mask_ptrs; - std::vector offsets; - - data_ptrs.reserve(data_view.num_columns()); - mask_ptrs.reserve(data_view.num_columns()); - offsets.reserve(data_view.num_columns()); - - auto const iters = thrust::make_zip_iterator( - thrust::make_tuple(data_ptrs.begin(), mask_ptrs.begin(), offsets.begin())); - - std::for_each(iters, iters + data_view.num_columns(), [&](auto const& tuple_vals) { - kernel_args.push_back(&thrust::get<0>(tuple_vals)); - kernel_args.push_back(&thrust::get<1>(tuple_vals)); - 
kernel_args.push_back(&thrust::get<2>(tuple_vals)); - }); - - std::transform(data_view.begin(), data_view.end(), iters, [&](column_view const& col) { - return thrust::make_tuple(cudf::jit::get_data_ptr(col), col.null_mask(), col.offset()); - }); - - cudf::jit::get_program_cache(*transform_jit_masked_udf_kernel_cu_jit) - .get_kernel(generic_kernel_name, - {}, - {{"transform/jit/operation-udf.hpp", generic_cuda_source}}, - {"-arch=sm_."}) - ->configure_1d_max_occupancy(0, 0, 0, stream.value()) - ->launch(kernel_args.data()); -} - } // namespace jit } // namespace transformation @@ -165,24 +89,6 @@ std::unique_ptr transform(column_view const& input, return output; } -std::unique_ptr generalized_masked_op(table_view const& data_view, - std::string const& udf, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - std::unique_ptr output = make_fixed_width_column(output_type, data_view.num_rows()); - std::unique_ptr output_mask = - make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, data_view.num_rows()); - - transformation::jit::generalized_operation( - data_view, udf, output_type, *output, *output_mask, stream, mr); - - auto final_output_mask = cudf::bools_to_mask(*output_mask); - output.get()->set_null_mask(std::move(*(final_output_mask.first))); - return output; -} - } // namespace detail std::unique_ptr transform(column_view const& input, @@ -195,12 +101,4 @@ std::unique_ptr transform(column_view const& input, return detail::transform(input, unary_udf, output_type, is_ptx, rmm::cuda_stream_default, mr); } -std::unique_ptr generalized_masked_op(table_view const& data_view, - std::string const& udf, - data_type output_type, - rmm::mr::device_memory_resource* mr) -{ - return detail::generalized_masked_op(data_view, udf, output_type, rmm::cuda_stream_default, mr); -} - } // namespace cudf diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd index 3153427ce3c..590a371ff52 100644 --- a/python/cudf/cudf/_lib/cpp/transform.pxd +++ b/python/cudf/cudf/_lib/cpp/transform.pxd @@ -34,12 +34,6 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: bool is_ptx ) except + - cdef unique_ptr[column] generalized_masked_op( - const table_view& data_view, - string udf, - data_type output_type, - ) except + - cdef pair[unique_ptr[table], unique_ptr[column]] encode( table_view input ) except + diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index a0eb7c68183..96d25cb92c9 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -123,30 +123,6 @@ def transform(Column input, op): return Column.from_unique_ptr(move(c_output)) -def masked_udf(incols, op, output_type): - cdef table_view data_view = table_view_from_table( - incols, ignore_index=True) - cdef string c_str = op.encode("UTF-8") - cdef type_id c_tid - cdef data_type c_dtype - - c_tid = ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - output_type - ] - ) - c_dtype = data_type(c_tid) - - with nogil: - c_output = move(libcudf_transform.generalized_masked_op( - data_view, - c_str, - c_dtype, - )) - - return Column.from_unique_ptr(move(c_output)) - - def table_encode(input): cdef table_view c_input = table_view_from_table( input, ignore_index=True) From 1904d1a9ff54343471998523816c9e0a00f46797 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 1 Dec 2021 13:00:16 -0600 Subject: [PATCH 037/202] Fix overflow for min calculation in strings::from_timestamps (#9793) This fixes 
#9790.

When converting a timestamp to a string, it is possible for the `%M` minutes
calculation to overflow an int32_t partway through the computation. This
changes that intermediate result to an int64_t, which avoids the overflow. For
example, the new test timestamp 128849018880000 ms is 128849018880 seconds, or
2147483648 total minutes, one more than an int32_t can represent.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/9793
---
 cpp/src/strings/convert/convert_datetime.cu | 4 ++--
 cpp/tests/strings/datetime_tests.cpp        | 8 +++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index 51a6a796ba3..8d0c5704a7b 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -707,9 +707,9 @@ struct from_timestamp_base {
    * scale( 61,60) -> 1
    * @endcode
    */
-  __device__ int32_t scale_time(int64_t time, int64_t base) const
+  __device__ int64_t scale_time(int64_t time, int64_t base) const
   {
-    return static_cast<int32_t>((time - ((time < 0) * (base - 1L))) / base);
+    return (time - ((time < 0) * (base - 1L))) / base;
   };
 
   __device__ time_components get_time_components(int64_t tstamp) const
diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp
index 4543607614f..9a01d5dd041 100644
--- a/cpp/tests/strings/datetime_tests.cpp
+++ b/cpp/tests/strings/datetime_tests.cpp
@@ -311,13 +311,14 @@ TEST_F(StringsDatetimeTest, FromTimestampAmPm)
 TEST_F(StringsDatetimeTest, FromTimestampMillisecond)
 {
   cudf::test::fixed_width_column_wrapper timestamps_ms{
-    1530705600123, 1582934461007, 1451430122421, 1318302183999, -6106017600047};
+    1530705600123, 1582934461007, 1451430122421, 1318302183999, -6106017600047, 128849018880000};
   auto results = cudf::strings::from_timestamps(timestamps_ms, "%Y-%m-%d %H:%M:%S.%3f");
   cudf::test::strings_column_wrapper expected_ms{"2018-07-04 12:00:00.123",
                                                  "2020-02-29 00:01:01.007",
                                                  "2015-12-29 23:02:02.421",
                                                  "2011-10-11 03:03:03.999",
-                                                 "1776-07-04 11:59:59.953"};
+                                                 "1776-07-04 11:59:59.953",
+                                                 "6053-01-23 02:08:00.000"};
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ms);
 
   results = cudf::strings::from_timestamps(timestamps_ms, "%Y-%m-%d %H:%M:%S.%f");
   expected_ms_6f{"2018-07-04 12:00:00.123000",
                  "2020-02-29 00:01:01.007000",
                  "2015-12-29 23:02:02.421000",
                  "2011-10-11 03:03:03.999000",
-                 "1776-07-04 11:59:59.953000"};
+                 "1776-07-04 11:59:59.953000",
+                 "6053-01-23 02:08:00.000000"};
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_ms_6f);
 
   cudf::test::fixed_width_column_wrapper timestamps_ns{

From 836f800e61acafa0fa6b3c7d9826904f0ba2ad06 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <36027403+codereport@users.noreply.github.com>
Date: Wed, 1 Dec 2021 16:46:14 -0500
Subject: [PATCH 038/202] Use CTAD with Thrust function objects (#9768)

While reviewing another PR, I noticed unnecessary use of explicit template
parameters with Thrust function objects and decided to open a small PR to
clean this up (CTAD, class template argument deduction, arrived in C++17).
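As a quick illustration of what CTAD buys here, consider the sketch below. It uses `std::plus`, which `thrust::plus` mirrors for this purpose; the example is illustrative and not taken from the PR:

```cpp
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> v{1, 2, 3, 4};

  // Pre-CTAD style: the template argument is spelled out explicitly.
  int a = std::accumulate(v.begin(), v.end(), 0, std::plus<int>{});

  // CTAD style: with no constructor arguments, the default template argument
  // is used, yielding the transparent std::plus<void>, whose operator()
  // deduces its operand types at each call site.
  int b = std::accumulate(v.begin(), v.end(), 0, std::plus{});

  std::printf("%d %d\n", a, b);  // prints: 10 10
  return 0;
}
```

In the diff below, the same trailing `{}` spelling replaces explicitly parameterized function objects such as `thrust::plus`, `thrust::equal_to`, and `thrust::identity`.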
CI depends on https://github.com/rapidsai/cudf/pull/9766 Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Mike Wilson (https://github.com/hyperbolic2346) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9768 --- cpp/include/cudf/strings/detail/gather.cuh | 2 +- cpp/include/cudf_test/column_wrapper.hpp | 7 ++----- cpp/src/copying/concatenate.cu | 2 +- cpp/src/groupby/sort/group_merge_m2.cu | 4 ++-- cpp/src/groupby/sort/group_rank_scan.cu | 2 +- cpp/src/groupby/sort/group_scan_util.cuh | 6 +++--- .../sort/group_single_pass_reduction_util.cuh | 16 ++++++++-------- cpp/src/groupby/sort/group_tdigest.cu | 10 +++++----- cpp/src/join/hash_join.cu | 2 +- cpp/src/join/join_utils.cu | 2 +- .../lists/combine/concatenate_list_elements.cu | 2 +- cpp/src/lists/contains.cu | 7 ++----- cpp/src/lists/interleave_columns.cu | 8 ++++---- cpp/src/quantiles/tdigest/tdigest.cu | 7 ++----- cpp/src/reductions/scan/scan_inclusive.cu | 9 ++++----- cpp/src/rolling/grouped_rolling.cu | 6 +++--- cpp/src/rolling/rolling_collect_list.cu | 2 +- cpp/src/sort/rank.cu | 10 +++++----- cpp/src/strings/copying/concatenate.cu | 2 +- cpp/src/strings/findall.cu | 7 ++----- cpp/src/strings/repeat_strings.cu | 2 +- cpp/src/strings/split/split.cu | 14 ++++---------- cpp/tests/iterator/iterator_tests.cuh | 11 +++-------- .../apply_boolean_mask_tests.cpp | 4 ++-- cpp/tests/strings/fixed_point_tests.cpp | 2 +- cpp/tests/transform/row_bit_count_test.cu | 6 ++---- 26 files changed, 63 insertions(+), 89 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index ec4a88a0e46..eb7258830ce 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -315,7 +315,7 @@ std::unique_ptr gather( d_out_offsets + output_count, [] __device__(auto size) { return static_cast(size); }, size_t{0}, - thrust::plus{}); + thrust::plus{}); CUDF_EXPECTS(total_bytes < static_cast(std::numeric_limits::max()), "total size of output strings is too large for a cudf column"); diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index cd2ac9f3ec1..ccfdde2270c 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1502,11 +1502,8 @@ class lists_column_wrapper : public detail::column_wrapper { // concatenate them together, skipping children that are null. std::vector children; - thrust::copy_if(std::cbegin(cols), - std::cend(cols), - valids, // stencil - std::back_inserter(children), - thrust::identity{}); + thrust::copy_if( + std::cbegin(cols), std::cend(cols), valids, std::back_inserter(children), thrust::identity{}); auto data = children.empty() ? 
cudf::empty_like(expected_hierarchy) : concatenate(children); diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index f4b6a8bf5fd..34c0cea683e 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -79,7 +79,7 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi device_views.cend(), std::next(offsets.begin()), [](auto const& col) { return col.size(); }, - thrust::plus{}); + thrust::plus{}); auto d_offsets = make_device_uvector_async(offsets, stream); auto const output_size = offsets.back(); diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu index 4e2a5b68abc..bde7c985df1 100644 --- a/cpp/src/groupby/sort/group_merge_m2.cu +++ b/cpp/src/groupby/sort/group_merge_m2.cu @@ -173,8 +173,8 @@ std::unique_ptr group_merge_m2(column_view const& values, // Generate bitmask for the output. // Only mean and M2 values can be nullable. Count column must be non-nullable. - auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr); if (null_count > 0) { result_means->set_null_mask(null_mask, null_count); // copy null_mask result_M2s->set_null_mask(std::move(null_mask), null_count); // take over null_mask diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 935ef9554a9..f36bdc0a660 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -79,7 +79,7 @@ std::unique_ptr rank_generator(column_view const& order_by, group_labels.end(), mutable_ranks.begin(), mutable_ranks.begin(), - thrust::equal_to{}, + thrust::equal_to{}, scan_op); return ranks; diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index ae3e3232e06..e25fdd6fc27 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -115,7 +115,7 @@ struct group_scan_functor() group_labels.end(), inp_iter, out_iter, - thrust::equal_to{}, + thrust::equal_to{}, binop); }; @@ -160,7 +160,7 @@ struct group_scan_functor{}, + thrust::equal_to{}, binop); }; @@ -214,7 +214,7 @@ struct group_scan_functor{}, + thrust::equal_to{}, binop); }; diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index decb127b264..95a36f40e57 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -191,7 +191,7 @@ struct group_reduction_functor{}, + thrust::equal_to{}, binop); }; @@ -215,10 +215,10 @@ struct group_reduction_functor validity(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + thrust::logical_or{}); - auto [null_mask, null_count] = cudf::detail::valid_if( - validity.begin(), validity.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); result->set_null_mask(std::move(null_mask), null_count); } return result; @@ -264,7 +264,7 @@ struct group_reduction_functor< inp_iter, thrust::make_discard_iterator(), out_iter, - thrust::equal_to{}, + thrust::equal_to{}, binop); }; @@ -283,10 +283,10 @@ struct group_reduction_functor< auto validity = 
rmm::device_uvector(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + thrust::logical_or{}); - auto [null_mask, null_count] = cudf::detail::valid_if( - validity.begin(), validity.end(), thrust::identity{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{}, stream, mr); result->set_null_mask(std::move(null_mask), null_count); } else { auto const binop = diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu index 146a6a8c31c..551eb128231 100644 --- a/cpp/src/groupby/sort/group_tdigest.cu +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -625,7 +625,7 @@ std::unique_ptr compute_tdigests(int delta, centroids_begin, // values thrust::make_discard_iterator(), // key output output, // output - thrust::equal_to{}, // key equality check + thrust::equal_to{}, // key equality check merge_centroids{}); // create final tdigest column @@ -850,8 +850,8 @@ std::unique_ptr group_merge_tdigest(column_view const& input, min_iter, thrust::make_discard_iterator(), merged_min_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::minimum{}); + thrust::equal_to{}, // key equality check + thrust::minimum{}); auto merged_max_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); @@ -864,8 +864,8 @@ std::unique_ptr group_merge_tdigest(column_view const& input, max_iter, thrust::make_discard_iterator(), merged_max_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::maximum{}); + thrust::equal_to{}, // key equality check + thrust::maximum{}); // for any empty groups, set the min and max to be 0. not technically necessary but it makes // testing simpler. 
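The recurring change in these hunks is that Thrust function objects are now value-initialized without an explicit element type (`thrust::plus{}`, `thrust::equal_to{}`, `thrust::identity{}`, ...), which under C++17 selects the transparent specialization whose `operator()` deduces its argument types at each call site. A minimal sketch of the idea, separate from the diff (assumes C++17 and a Thrust version that ships the transparent `void` specializations):

```cpp
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>

int main()
{
  int const data[] = {1, 2, 3, 4};
  // Old style: the element type is spelled out and must be kept in sync
  // with the value type of the reduction by hand.
  int const a = thrust::reduce(thrust::seq, data, data + 4, 0, thrust::plus<int>{});
  // New style: thrust::plus{} is the transparent functor; its operator()
  // deduces the operand types per call, so no template argument is needed.
  int const b = thrust::reduce(thrust::seq, data, data + 4, 0, thrust::plus{});
  return a == b ? 0 : 1;  // both compute 10
}
```

Besides being shorter, dropping the explicit type avoids a class of bugs where the functor's element type silently disagrees with the accumulator's (e.g. a 32-bit `plus` used in a `size_t` reduction).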
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index e4bd1938ecc..c5b680f129e 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -266,7 +266,7 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, left_join_complement_size = thrust::count_if(rmm::exec_policy(stream), invalid_index_map->begin(), invalid_index_map->end(), - thrust::identity()); + thrust::identity()); } return join_size + left_join_complement_size; } diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 4aca4b4a9cf..9e98f87e7f0 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -136,7 +136,7 @@ get_left_join_indices_complement(std::unique_ptr> thrust::make_counting_iterator(end_counter), invalid_index_map->begin(), right_indices_complement->begin(), - thrust::identity()) - + thrust::identity{}) - right_indices_complement->begin(); right_indices_complement->resize(indices_count, stream); } diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index 4bef312b396..2ddede97ce4 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -225,7 +225,7 @@ std::unique_ptr concatenate_lists_nullifying_rows(column_view const& inp auto list_entries = gather_list_entries(input, offsets_view, num_rows, num_output_entries, stream, mr); auto [null_mask, null_count] = cudf::detail::valid_if( - list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); return make_lists_column(num_rows, std::move(list_offsets), diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu index bdbc9ae013c..b48982d205a 100644 --- a/cpp/src/lists/contains.cu +++ b/cpp/src/lists/contains.cu @@ -74,11 +74,8 @@ struct lookup_functor { if (!search_keys_have_nulls && !input_lists.has_nulls() && !input_lists.child().has_nulls()) { return {rmm::device_buffer{0, stream, mr}, size_type{0}}; } else { - return cudf::detail::valid_if(result_validity.begin(), - result_validity.end(), - thrust::identity{}, - stream, - mr); + return cudf::detail::valid_if( + result_validity.begin(), result_validity.end(), thrust::identity{}, stream, mr); } } diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index b9b73d98ed2..220cb25a942 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -228,8 +228,8 @@ struct interleave_list_entries_impl{}, stream, mr); + auto [null_mask, null_count] = + cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr); return make_strings_column(num_output_entries, std::move(offsets_column), @@ -306,7 +306,7 @@ struct interleave_list_entries_impl( if (data_has_null_mask) { auto [null_mask, null_count] = cudf::detail::valid_if( - validities.begin(), validities.end(), thrust::identity{}, stream, mr); + validities.begin(), validities.end(), thrust::identity{}, stream, mr); if (null_count > 0) { output->set_null_mask(null_mask, null_count); } } @@ -405,7 +405,7 @@ std::unique_ptr interleave_columns(table_view const& input, } auto [null_mask, null_count] = cudf::detail::valid_if( - list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); return make_lists_column(num_output_lists, std::move(list_offsets), 
std::move(list_entries), diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 57c221b15ed..18e7d02d086 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -348,11 +348,8 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, if (null_count == 0) { return std::pair{rmm::device_buffer{}, null_count}; } - return cudf::detail::valid_if(tdigest_is_empty, - tdigest_is_empty + tdv.size(), - thrust::logical_not{}, - stream, - mr); + return cudf::detail::valid_if( + tdigest_is_empty, tdigest_is_empty + tdv.size(), thrust::logical_not{}, stream, mr); }(); return cudf::make_lists_column( diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index 70f5ca90539..b0e761c4c3b 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -50,11 +50,10 @@ rmm::device_buffer mask_scan(column_view const& input_view, auto valid_itr = detail::make_validity_iterator(*d_input); auto first_null_position = [&] { - size_type const first_null = thrust::find_if_not(rmm::exec_policy(stream), - valid_itr, - valid_itr + input_view.size(), - thrust::identity{}) - - valid_itr; + size_type const first_null = + thrust::find_if_not( + rmm::exec_policy(stream), valid_itr, valid_itr + input_view.size(), thrust::identity{}) - + valid_itr; size_type const exclusive_offset = (inclusive == scan_type::EXCLUSIVE) ? 1 : 0; return std::min(input_view.size(), first_null + exclusive_offset); }(); diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 509f67bb5c6..5a7f15148d8 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -142,8 +142,8 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, preceding_window] __device__(size_type idx) { auto group_label = d_group_labels[idx]; auto group_start = d_group_offsets[group_label]; - return thrust::minimum{}(preceding_window, - idx - group_start + 1); // Preceding includes current row. + return thrust::minimum{}(preceding_window, + idx - group_start + 1); // Preceding includes current row. }; auto following_calculator = [d_group_offsets = group_offsets.data(), @@ -152,7 +152,7 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, auto group_label = d_group_labels[idx]; auto group_end = d_group_offsets[group_label + 1]; // Cannot fall off the end, since offsets // is capped with `input.size()`. 
- return thrust::minimum{}(following_window, (group_end - 1) - idx); + return thrust::minimum{}(following_window, (group_end - 1) - idx); }; if (aggr.kind == aggregation::CUDA || aggr.kind == aggregation::PTX) { diff --git a/cpp/src/rolling/rolling_collect_list.cu b/cpp/src/rolling/rolling_collect_list.cu index ecef90dc8e1..30c39bde7d2 100644 --- a/cpp/src/rolling/rolling_collect_list.cu +++ b/cpp/src/rolling/rolling_collect_list.cu @@ -75,7 +75,7 @@ std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view con per_row_mapping_begin, per_row_mapping_begin + num_child_rows, per_row_mapping_begin, - thrust::maximum{}); + thrust::maximum{}); return per_row_mapping; } diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index c8a908e44cd..e9589e6c4b3 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -117,7 +117,7 @@ void tie_break_ranks_transform(cudf::device_span dense_rank_sor tie_iter, thrust::make_discard_iterator(), tie_sorted.begin(), - thrust::equal_to{}, + thrust::equal_to{}, tie_breaker); auto sorted_tied_rank = thrust::make_transform_iterator( dense_rank_sorted.begin(), @@ -171,8 +171,8 @@ void rank_min(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::minimum{}, - thrust::identity{}, + thrust::minimum{}, + thrust::identity{}, stream); } @@ -189,8 +189,8 @@ void rank_max(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::maximum{}, - thrust::identity{}, + thrust::maximum{}, + thrust::identity{}, stream); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index db8b37a9592..3822fa8bf5a 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -96,7 +96,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s device_views_ptr + views.size(), std::next(d_partition_offsets.begin()), chars_size_transform{}, - thrust::plus{}); + thrust::plus{}); auto const output_chars_size = d_partition_offsets.back_element(stream); stream.synchronize(); // ensure copy of output_chars_size is complete before returning diff --git a/cpp/src/strings/findall.cu b/cpp/src/strings/findall.cu index 3ab5b55020c..8d96f0de415 100644 --- a/cpp/src/strings/findall.cu +++ b/cpp/src/strings/findall.cu @@ -153,11 +153,8 @@ std::unique_ptr
findall_re( std::vector> results; - size_type const columns = thrust::reduce(rmm::exec_policy(stream), - find_counts.begin(), - find_counts.end(), - 0, - thrust::maximum{}); + size_type const columns = thrust::reduce( + rmm::exec_policy(stream), find_counts.begin(), find_counts.end(), 0, thrust::maximum{}); // boundary case: if no columns, return all nulls column (issue #119) if (columns == 0) results.emplace_back(std::make_unique( diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 458f3ed885c..7820e0064a6 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -369,7 +369,7 @@ std::pair, int64_t> repeat_strings_output_sizes( thrust::make_counting_iterator(strings_count), fn, int64_t{0}, - thrust::plus{}); + thrust::plus{}); return std::make_pair(std::move(output_sizes), total_bytes); } diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 5113b418501..c6e52a79059 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -490,11 +490,8 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, }); // the columns_count is the maximum number of tokens for any string - auto const columns_count = thrust::reduce(rmm::exec_policy(stream), - token_counts.begin(), - token_counts.end(), - 0, - thrust::maximum{}); + auto const columns_count = thrust::reduce( + rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); // boundary case: if no columns, return one null column (custrings issue #119) if (columns_count == 0) { results.push_back(std::make_unique( @@ -748,11 +745,8 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, [tokenizer] __device__(size_type idx) { return tokenizer.count_tokens(idx); }); // column count is the maximum number of tokens for any string - size_type const columns_count = thrust::reduce(rmm::exec_policy(stream), - token_counts.begin(), - token_counts.end(), - 0, - thrust::maximum{}); + size_type const columns_count = thrust::reduce( + rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); std::vector> results; // boundary case: if no columns, return one null column (issue #119) diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 07eb595449c..d93c1275122 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -51,13 +51,8 @@ struct IteratorTest : public cudf::test::BaseFixture { // Get temporary storage size size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, - temp_storage_bytes, - d_in, - dev_result.begin(), - num_items, - thrust::minimum{}, - init); + cub::DeviceReduce::Reduce( + nullptr, temp_storage_bytes, d_in, dev_result.begin(), num_items, thrust::minimum{}, init); // Allocate temporary storage rmm::device_buffer d_temp_storage(temp_storage_bytes, rmm::cuda_stream_default); @@ -68,7 +63,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in, dev_result.begin(), num_items, - thrust::minimum{}, + thrust::minimum{}, init); evaluate(expected, dev_result, "cub test"); diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index 813cceb0861..c80a8fba55c 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -204,13 +204,13 @@ TEST_F(ApplyBooleanMask, FixedPointLargeColumnTest) dec32_data.cend(), mask_data.cbegin(), std::back_inserter(expect_dec32_data), - thrust::identity()); + thrust::identity{}); thrust::copy_if(thrust::seq, dec64_data.cbegin(), dec64_data.cend(), mask_data.cbegin(), std::back_inserter(expect_dec64_data), - thrust::identity()); + thrust::identity{}); decimal32_wrapper expect_col32( expect_dec32_data.begin(), expect_dec32_data.end(), numeric::scale_type{-3}); diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index ce4280e0733..5872a9e5bb7 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -329,4 +329,4 @@ TEST_F(StringsConvertTest, DISABLED_FixedPointStringConversionOperator) auto const c = numeric::decimal128{numeric::scaled_integer{max, numeric::scale_type{-38}}}; EXPECT_EQ(static_cast(c), "1.70141183460469231731687303715884105727"); -} \ No newline at end of file +} diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 7fb7326f221..43d63c9fd22 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -239,10 +239,8 @@ TEST_F(RowBitCount, StructsWithLists_RowsExceedingASingleBlock) // List child column = {0, 1, 2, 3, 4, ..., 2*num_rows}; auto ints = make_numeric_column(data_type{type_id::INT32}, num_rows * 2); auto ints_view = ints->mutable_view(); - thrust::tabulate(thrust::device, - ints_view.begin(), - ints_view.end(), - thrust::identity()); + thrust::tabulate( + thrust::device, ints_view.begin(), ints_view.end(), thrust::identity{}); // List offsets = {0, 2, 4, 6, 8, ..., num_rows*2}; auto list_offsets = 
make_numeric_column(data_type{type_id::INT32}, num_rows + 1); From 677e63236a81ea3c402df993845a1fdc98072c9e Mon Sep 17 00:00:00 2001 From: Conor Hoekstra <36027403+codereport@users.noreply.github.com> Date: Wed, 1 Dec 2021 16:46:25 -0500 Subject: [PATCH 039/202] Avoid overflow for `fixed_point` `cudf::cast` and performance optimization (#9772) This resolves https://github.com/rapidsai/cudf/issues/9000. When using `cudf::cast` for a wider decimal type to a narrower decimal type, you can overflow. This PR modifies the code path for this specific use case so that the "rescale" happens for the type cast. A small perf improvement was added when you have identical scales to avoid rescaling. CI depends on https://github.com/rapidsai/cudf/pull/9766 Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/9772 --- cpp/src/unary/cast_ops.cu | 49 +++++++++++++++++++++------------- cpp/tests/unary/cast_tests.cpp | 13 +++++++++ 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu index e852b00796a..131fde11cf8 100644 --- a/cpp/src/unary/cast_ops.cu +++ b/cpp/src/unary/cast_ops.cu @@ -305,28 +305,39 @@ struct dispatch_unary_cast_to { rmm::mr::device_memory_resource* mr) { using namespace numeric; - - auto const size = input.size(); - auto temporary = - std::make_unique(cudf::data_type{type.id(), input.type().scale()}, - size, - rmm::device_buffer{size * cudf::size_of(type), stream}, - copy_bitmask(input, stream), - input.null_count()); - using SourceDeviceT = device_storage_type_t; using TargetDeviceT = device_storage_type_t; - mutable_column_view output_mutable = *temporary; - - thrust::transform(rmm::exec_policy(stream), - input.begin(), - input.end(), - output_mutable.begin(), - device_cast{}); - - // clearly there is a more efficient way to do this, can optimize in the future - return rescale(*temporary, numeric::scale_type{type.scale()}, stream, mr); + auto casted = [&]() { + auto const size = input.size(); + auto output = std::make_unique(cudf::data_type{type.id(), input.type().scale()}, + size, + rmm::device_buffer{size * cudf::size_of(type), stream}, + copy_bitmask(input, stream), + input.null_count()); + + mutable_column_view output_mutable = *output; + + thrust::transform(rmm::exec_policy(stream), + input.begin(), + input.end(), + output_mutable.begin(), + device_cast{}); + + return output; + }; + + if (input.type().scale() == type.scale()) return casted(); + + if constexpr (sizeof(SourceDeviceT) < sizeof(TargetDeviceT)) { + // device_cast BEFORE rescale when SourceDeviceT is < TargetDeviceT + auto temporary = casted(); + return detail::rescale(*temporary, scale_type{type.scale()}, stream, mr); + } else { + // device_cast AFTER rescale when SourceDeviceT is > TargetDeviceT to avoid overflow + auto temporary = detail::rescale(input, scale_type{type.scale()}, stream, mr); + return detail::cast(*temporary, type, stream, mr); + } } template view()); } + +TEST_F(FixedPointTestSingleType, Int32ToInt64Convert) +{ + using namespace numeric; + using fp_wrapperA = cudf::test::fixed_point_column_wrapper; + using fp_wrapperB = cudf::test::fixed_point_column_wrapper; + + auto const input = fp_wrapperB{{141230900000L}, scale_type{-10}}; + auto const expected = fp_wrapperA{{14123}, scale_type{-3}}; + auto const result = cudf::cast(input, 
make_fixed_point_data_type(-3)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} From 7d8a8e53f495279ae129fa46948c07230d6e77b4 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Wed, 1 Dec 2021 13:53:05 -0800 Subject: [PATCH 040/202] Allow cast decimal128 to string and add tests (#9756) Small PR that enables Decimal128 cast Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9756 --- java/src/main/native/src/ColumnViewJni.cpp | 3 ++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 4efac307627..02d5dc4569c 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -916,7 +916,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas case cudf::type_id::INT64: case cudf::type_id::UINT64: result = cudf::strings::from_integers(*column); break; case cudf::type_id::DECIMAL32: - case cudf::type_id::DECIMAL64: result = cudf::strings::from_fixed_point(*column); break; + case cudf::type_id::DECIMAL64: + case cudf::type_id::DECIMAL128: result = cudf::strings::from_fixed_point(*column); break; default: JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Invalid data type", 0); } } else if (column->type().id() == cudf::type_id::STRING) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index fa9052029cc..31a52eb2ec0 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3372,6 +3372,22 @@ void testFixedWidthCast() { } } + @Test + void testCastBigDecimalToString() { + BigDecimal[] bigValues = {new BigDecimal("923121331938210123.321"), + new BigDecimal("9223372036854775808.191"), + new BigDecimal("9328323982309091029831.002") + }; + + try (ColumnVector cv = ColumnVector.fromDecimals(bigValues); + ColumnVector values = cv.castTo(DType.STRING); + ColumnVector expected = ColumnVector.fromStrings("923121331938210123.321", + "9223372036854775808.191", + "9328323982309091029831.002")) { + assertColumnsAreEqual(expected, values); + } + } + @Test void testCastStringToBigDecimal() { String[] bigValues = {"923121331938210123.321", From 5491cc789bbfbaad7099124dcfe004719e7f013c Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 2 Dec 2021 03:30:50 +0530 Subject: [PATCH 041/202] Fix memory error due to lambda return type deduction limitation (#9778) Fixes #9703 replace device lambda with device functor with return type. (due to [14. 
extended-lambda-restrictions](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-restrictions))

~add `__host__` to lambda for nvcc return type deduction to work properly.~
~replaced `auto` (generic lambda) with `size_type`.~

Fixes the shared memory write error reported in #9703.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/9778
---
 cpp/src/sort/rank.cu         | 13 +++++++++----
 cpp/tests/sort/rank_test.cpp | 14 ++++++++++++++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu
index e9589e6c4b3..de0a44e3234 100644
--- a/cpp/src/sort/rank.cu
+++ b/cpp/src/sort/rank.cu
@@ -194,6 +194,12 @@ void rank_max(cudf::device_span group_keys,
     stream);
 }
 
+// Returns index, count
+template
+struct index_counter {
+  __device__ T operator()(size_type i) { return T{i, 1}; }
+};
+
 void rank_average(cudf::device_span group_keys,
                   column_view sorted_order_view,
                   mutable_column_view rank_mutable_view,
@@ -208,10 +214,9 @@ void rank_average(cudf::device_span group_keys,
   using MinCount = thrust::pair;
   tie_break_ranks_transform(
     group_keys,
-    cudf::detail::make_counting_transform_iterator(1,
-                                                   [] __device__(auto i) {
-                                                     return MinCount{i, 1};
-                                                   }),
+    // Use device functor with return type. Cannot use device lambda due to limitation.
+    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda-restrictions
+    cudf::detail::make_counting_transform_iterator(1, index_counter{}),
     sorted_order_view,
     rank_mutable_view.begin(),
     [] __device__(auto rank_count1, auto rank_count2) {
diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp
index 94e389fc7ce..926ad1e203e 100644
--- a/cpp/tests/sort/rank_test.cpp
+++ b/cpp/tests/sort/rank_test.cpp
@@ -410,5 +410,19 @@ TYPED_TEST(Rank, min_desc_bottom_pct)
   this->run_all_tests(rank_method::MIN, desc_bottom, col1_rank, col2_rank, col3_rank, true);
 }
 
+struct RankLarge : public BaseFixture {
+};
+
+TEST_F(RankLarge, average_large)
+{
+  // testcase of https://github.com/rapidsai/cudf/issues/9703
+  auto iter = thrust::counting_iterator(0);
+  fixed_width_column_wrapper col1(iter, iter + 10558);
+  auto result =
+    cudf::rank(col1, rank_method::AVERAGE, {}, null_policy::EXCLUDE, null_order::AFTER, false);
+  fixed_width_column_wrapper expected(iter + 1, iter + 10559);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+}
+
 } // namespace test
 } // namespace cudf

From c10966cc3847ca9837ddc7ce5df9c4d9b7c743d8 Mon Sep 17 00:00:00 2001
From: Alfred Xu
Date: Thu, 2 Dec 2021 18:48:03 +0800
Subject: [PATCH 042/202] Fix make_empty_scalar_like on list_type (#9759)

Fixes #9758

In `make_empty_scalar_like`, we create the list scalar from the list column itself, which is wrong. The correct way is to build it from the child of the list column.
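As a sketch of the type-level distinction behind the fix (illustrative only; the helper name and the `LIST<INT32>` input are hypothetical, and error handling is omitted):

```cpp
#include <cudf/copying.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar_factories.hpp>

#include <memory>

// Builds an invalid (null) empty list scalar matching a LIST column, e.g. LIST<INT32>.
std::unique_ptr<cudf::scalar> empty_list_scalar_like(cudf::column_view const& col)
{
  // Wrong: cudf::empty_like(col) is itself a LIST<INT32> column, so wrapping it
  // in a list scalar would produce a LIST<LIST<INT32>> scalar -- one nesting
  // level too deep.
  // Right: use the child column (INT32) so the scalar has type LIST<INT32>.
  auto const empty_child = cudf::empty_like(cudf::lists_column_view(col).child());
  auto result = cudf::make_list_scalar(empty_child->view());
  result->set_valid_async(false);
  return result;
}
```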
Authors: - Alfred Xu (https://github.com/sperlingxx) Approvers: - Nghia Truong (https://github.com/ttnghia) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/9759 --- cpp/src/scalar/scalar_factories.cpp | 7 +++++-- cpp/tests/reductions/reduction_tests.cpp | 8 ++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index d2876435780..c18b57d220f 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -21,6 +21,7 @@ #include #include +#include #include namespace cudf { @@ -184,10 +185,12 @@ std::unique_ptr make_empty_scalar_like(column_view const& column, { std::unique_ptr result; switch (column.type().id()) { - case type_id::LIST: - result = make_list_scalar(empty_like(column)->view(), stream, mr); + case type_id::LIST: { + auto const empty_child = empty_like(lists_column_view(column).child()); + result = make_list_scalar(empty_child->view(), stream, mr); result->set_valid_async(false, stream); break; + } case type_id::STRUCT: // The input column must have at least 1 row to extract a scalar (row) from it. result = detail::get_element(column, 0, stream, mr); diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index d8ee8f9d08d..e138cd6f68e 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -1961,7 +1961,11 @@ struct ListReductionTest : public cudf::test::BaseFixture { cudf::reduce(input_data, agg, cudf::data_type(cudf::type_id::LIST)); auto list_result = dynamic_cast(result.get()); EXPECT_EQ(is_valid, list_result->is_valid()); - if (is_valid) { CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view()); } + if (is_valid) { + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_value, list_result->view()); + } else { + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_value, list_result->view()); + } }; if (succeeded_condition) { @@ -2047,7 +2051,7 @@ TEST_F(ListReductionTest, NonValidListReductionNthElement) // test against empty input this->reduction_test(LCW{}, - ElementCol{{0}, {0}}, // expected_value, + ElementCol{}, // expected_value, true, false, cudf::make_nth_element_aggregation(0, cudf::null_policy::INCLUDE)); From 582cc6e466c7d941e1b34893fd56fbd42fe90d68 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Thu, 2 Dec 2021 21:12:01 +0800 Subject: [PATCH 043/202] Add sample JNI API (#9728) Add sample JNI Signed-off-by: Chong Gao Authors: - Chong Gao (https://github.com/res-life) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9728 --- java/src/main/java/ai/rapids/cudf/Table.java | 30 +++++++++++++++++++ java/src/main/native/src/TableJni.cpp | 15 ++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 21 +++++++++++++ 3 files changed, 66 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b0791fb440f..b11808ed023 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -678,6 +678,8 @@ private static native ContiguousTable[] contiguousSplitGroups(long inputTable, boolean[] keysDescending, boolean[] keysNullSmallest); + private static native long[] sample(long tableHandle, long n, boolean replacement, long seed); + ///////////////////////////////////////////////////////////////////////////// // TABLE CREATION APIs 
///////////////////////////////////////////////////////////////////////////// @@ -2801,6 +2803,34 @@ public static Table fromPackedTable(ByteBuffer metadata, DeviceMemoryBuffer data return result; } + + /** + * Gather `n` samples from table randomly + * Note: does not preserve the ordering + * Example: + * input: {col1: {1, 2, 3, 4, 5}, col2: {6, 7, 8, 9, 10}} + * n: 3 + * replacement: false + * + * output: {col1: {3, 1, 4}, col2: {8, 6, 9}} + * + * replacement: true + * + * output: {col1: {3, 1, 1}, col2: {8, 6, 6}} + * + * throws "logic_error" if `n` > table rows and `replacement` == FALSE. + * throws "logic_error" if `n` < 0. + * + * @param n non-negative number of samples expected from table + * @param replacement Allow or disallow sampling of the same row more than once. + * @param seed Seed value to initiate random number generator. + * + * @return Table containing samples + */ + public Table sample(long n, boolean replacement, long seed) { + return new Table(sample(nativeHandle, n, replacement, seed)); + } + ///////////////////////////////////////////////////////////////////////////// // HELPER CLASSES ///////////////////////////////////////////////////////////////////////////// diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index a78d40a58f7..f3377bb002d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -3147,4 +3148,18 @@ JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups( CATCH_STD(env, NULL); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_sample(JNIEnv *env, jclass, jlong j_input, + jlong n, jboolean replacement, + jlong seed) { + JNI_NULL_CHECK(env, j_input, "input table is null", 0); + try { + cudf::jni::auto_set_device(env); + cudf::table_view *input = reinterpret_cast(j_input); + auto sample_with_replacement = + replacement ? 
cudf::sample_with_replacement::TRUE : cudf::sample_with_replacement::FALSE; + std::unique_ptr result = cudf::sample(*input, n, sample_with_replacement, seed); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index fa221e19387..0b2f56895e9 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -7584,4 +7584,25 @@ void testExplodeOuterPosition() { } } } + + @Test + void testSample() { + try (Table t = new Table.TestBuilder().column("s1", "s2", "s3", "s4", "s5").build()) { + try (Table ret = t.sample(3, false, 0); + Table expected = new Table.TestBuilder().column("s3", "s4", "s5").build()) { + assertTablesAreEqual(expected, ret); + } + + try (Table ret = t.sample(5, false, 0); + Table expected = new Table.TestBuilder().column("s3", "s4", "s5", "s2", "s1").build()) { + assertTablesAreEqual(expected, ret); + } + + try (Table ret = t.sample(8, true, 0); + Table expected = new Table.TestBuilder() + .column("s1", "s1", "s4", "s5", "s5", "s1", "s3", "s2").build()) { + assertTablesAreEqual(expected, ret); + } + } + } } From 1077daeaad8ff710de6f4fbb99f2e7371b4af8de Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Thu, 2 Dec 2021 15:51:04 -0600 Subject: [PATCH 044/202] Fix caching in `Series.applymap` (#9821) The cache key we were generating for these functions didn't take into account the constants that could be different in the bytecode. Hence certain functions were causing cache hits when they actually differ by a constant value somewhere in the logic. Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9821 --- python/cudf/cudf/tests/test_udf_masked_ops.py | 19 +++++++++++++++++++ python/cudf/cudf/utils/cudautils.py | 4 +++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index dc126546f15..c9c2c440632 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -593,3 +593,22 @@ def func(row, c, k): return y run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) + + +def test_masked_udf_caching(): + # Make sure similar functions that differ + # by simple things like constants actually + # recompile + + data = cudf.Series([1, 2, 3]) + expect = data ** 2 + got = data.applymap(lambda x: x ** 2) + + assert_eq(expect, got, check_dtype=False) + + # update the constant value being used and make sure + # it does not result in a cache hit + + expect = data ** 3 + got = data.applymap(lambda x: x ** 3) + assert_eq(expect, got, check_dtype=False) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 5fa091a0081..f0533dcaa72 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -216,12 +216,14 @@ def make_cache_key(udf, sig): recompiling the same function for the same set of types """ codebytes = udf.__code__.co_code + constants = udf.__code__.co_consts if udf.__closure__ is not None: cvars = tuple([x.cell_contents for x in udf.__closure__]) cvarbytes = dumps(cvars) else: cvarbytes = b"" - 
return codebytes, cvarbytes, sig + + return constants, codebytes, cvarbytes, sig def compile_udf(udf, type_signature): From 50acf076d4a35bc57dc00a416f0d9507b1992c0f Mon Sep 17 00:00:00 2001 From: MithunR Date: Thu, 2 Dec 2021 14:07:31 -0800 Subject: [PATCH 045/202] Fix stream usage in `segmented_gather()` (#9679) `detail::segmented_gather()` inadvertently uses `cuda_default_stream` in some parts of its implementation, while using the user-specified stream in others. This applies to the calls to `copy_range_in_place()`, `allocate_like()`, and `make_lists_column()`. ~This might produce race conditions, which might explain NVIDIA/spark-rapids/issues/4060. It's a rare failure that's quite hard to reproduce.~ This might lead to over-synchronization, though bad output is unlikely. The commit here should sort this out, by switching to the `detail` APIs corresponding to the calls above. Authors: - MithunR (https://github.com/mythrocks) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9679 --- cpp/src/lists/copying/segmented_gather.cu | 21 ++++++++++++--------- cpp/src/lists/extract.cu | 2 +- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index 8cbcddc1c58..41187b96cdb 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -13,8 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include -#include #include #include #include @@ -88,14 +88,15 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, auto child = std::move(child_table->release().front()); // Create list offsets from gather_map. - auto output_offset = cudf::allocate_like( - gather_map.offsets(), gather_map.size() + 1, mask_allocation_policy::RETAIN, mr); + auto output_offset = cudf::detail::allocate_like( + gather_map.offsets(), gather_map.size() + 1, mask_allocation_policy::RETAIN, stream, mr); auto output_offset_view = output_offset->mutable_view(); - cudf::copy_range_in_place(gather_map.offsets(), - output_offset_view, - gather_map.offset(), - gather_map.offset() + output_offset_view.size(), - 0); + cudf::detail::copy_range_in_place(gather_map.offsets(), + output_offset_view, + gather_map.offset(), + gather_map.offset() + output_offset_view.size(), + 0, + stream); // Assemble list column & return auto null_mask = cudf::detail::copy_bitmask(value_column.parent(), stream, mr); size_type null_count = value_column.null_count(); @@ -103,7 +104,9 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, std::move(output_offset), std::move(child), null_count, - std::move(null_mask)); + std::move(null_mask), + stream, + mr); } } // namespace detail diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index 381864e1a68..7c6c612eb25 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -53,7 +53,7 @@ std::unique_ptr make_index_child(column_view const& indices, // `segmented_gather()` on a null index should produce a null row. if (not indices.nullable()) { return std::make_unique(indices, stream); } - auto const d_indices = column_device_view::create(indices); + auto const d_indices = column_device_view::create(indices, stream); // Replace null indices with MAX_SIZE_TYPE, so that gather() returns null for them. 
auto const null_replaced_iter_begin = cudf::detail::make_null_replacement_iterator(*d_indices, std::numeric_limits::max()); From b848dd5c9cfef7e3523810d67296e037f31945c1 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 2 Dec 2021 14:40:57 -0800 Subject: [PATCH 046/202] Fix ORC writer crash with empty input columns (#9808) Fixes https://github.com/rapidsai/cudf/issues/9783 Skip some parts of writing when the input table was zero rows. Add is_empty to `hostdevice_2dvector`. Add Python test with empty columns. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) - Devavret Makkar (https://github.com/devavret) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9808 --- cpp/src/io/orc/writer_impl.cu | 338 +++++++++++---------- cpp/src/io/utilities/hostdevice_vector.hpp | 1 + python/cudf/cudf/tests/test_orc.py | 15 + 3 files changed, 188 insertions(+), 166 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e53fb3589bc..db02125ce77 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -579,12 +579,15 @@ orc_streams writer::impl::create_streams(host_span columns, } auto const direct_data_size = - std::accumulate(segmentation.stripes.front().cbegin(), - segmentation.stripes.back().cend(), - size_t{0}, - [&](auto data_size, auto rg_idx) { - return data_size + column.host_dict_chunk(rg_idx)->string_char_count; - }); + segmentation.num_stripes() == 0 + ? 0 + : std::accumulate(segmentation.stripes.front().cbegin(), + segmentation.stripes.back().cend(), + size_t{0}, + [&](auto data_size, auto rg_idx) { + return data_size + + column.host_dict_chunk(rg_idx)->string_char_count; + }); if (enable_dict) { uint32_t dict_bits = 0; for (dict_bits = 1; dict_bits < 32; dict_bits <<= 1) { @@ -988,17 +991,19 @@ encoded_data encode_columns(orc_table_view const& orc_table, } chunk_streams.host_to_device(stream); - if (orc_table.num_string_columns() != 0) { - auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict(); - gpu::EncodeStripeDictionaries(d_stripe_dict, - chunks, - orc_table.num_string_columns(), - segmentation.num_stripes(), - chunk_streams, - stream); - } + if (orc_table.num_rows() > 0) { + if (orc_table.num_string_columns() != 0) { + auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict(); + gpu::EncodeStripeDictionaries(d_stripe_dict, + chunks, + orc_table.num_string_columns(), + segmentation.num_stripes(), + chunk_streams, + stream); + } - gpu::EncodeOrcColumnData(chunks, chunk_streams, stream); + gpu::EncodeOrcColumnData(chunks, chunk_streams, stream); + } dictionaries.data.clear(); dictionaries.index.clear(); stream.synchronize(); @@ -1803,7 +1808,7 @@ void writer::impl::write(table_view const& table) auto dictionaries = allocate_dictionaries(orc_table, rowgroup_bounds, stream); hostdevice_2dvector dict( rowgroup_bounds.size().first, orc_table.num_string_columns(), stream); - if (orc_table.num_string_columns() != 0) { + if (not dict.is_empty()) { init_dictionaries(orc_table, rowgroup_bounds, dictionaries.d_data_view, @@ -1819,7 +1824,7 @@ void writer::impl::write(table_view const& table) // Build stripe-level dictionaries hostdevice_2dvector stripe_dict( segmentation.num_stripes(), orc_table.num_string_columns(), stream); - if (orc_table.num_string_columns() != 0) { + if (not stripe_dict.is_empty()) { build_dictionaries(orc_table, 
segmentation.stripes, dict, @@ -1842,165 +1847,166 @@ void writer::impl::write(table_view const& table) segmentation.num_stripes(), num_data_streams, stream); auto stripes = gather_stripes(num_index_streams, segmentation, &enc_data.streams, &strm_descs); - // Gather column statistics - std::vector column_stats; - if (enable_statistics_ && table.num_columns() > 0 && num_rows > 0) { - column_stats = gather_statistic_blobs(orc_table, segmentation); - } + if (num_rows > 0) { + // Gather column statistics + auto const column_stats = enable_statistics_ && table.num_columns() > 0 + ? gather_statistic_blobs(orc_table, segmentation) + : std::vector{}; - // Allocate intermediate output stream buffer - size_t compressed_bfr_size = 0; - size_t num_compressed_blocks = 0; - size_t max_compressed_block_size = 0; - if (compression_kind_ != NONE) { - nvcompBatchedSnappyCompressGetMaxOutputChunkSize( - compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size); - } - auto stream_output = [&]() { - size_t max_stream_size = 0; - bool all_device_write = true; + // Allocate intermediate output stream buffer + size_t compressed_bfr_size = 0; + size_t num_compressed_blocks = 0; + size_t max_compressed_block_size = 0; + if (compression_kind_ != NONE) { + nvcompBatchedSnappyCompressGetMaxOutputChunkSize( + compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size); + } + auto stream_output = [&]() { + size_t max_stream_size = 0; + bool all_device_write = true; + + for (auto& ss : strm_descs.host_view().flat_view()) { + if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } + size_t stream_size = ss.stream_size; + if (compression_kind_ != NONE) { + ss.first_block = num_compressed_blocks; + ss.bfr_offset = compressed_bfr_size; + + auto num_blocks = std::max( + (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); + stream_size += num_blocks * BLOCK_HEADER_SIZE; + num_compressed_blocks += num_blocks; + compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks; + } + max_stream_size = std::max(max_stream_size, stream_size); + } - for (auto& ss : strm_descs.host_view().flat_view()) { - if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } - size_t stream_size = ss.stream_size; - if (compression_kind_ != NONE) { - ss.first_block = num_compressed_blocks; - ss.bfr_offset = compressed_bfr_size; - - auto num_blocks = std::max( - (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); - stream_size += num_blocks * BLOCK_HEADER_SIZE; - num_compressed_blocks += num_blocks; - compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks; + if (all_device_write) { + return pinned_buffer{nullptr, cudaFreeHost}; + } else { + return pinned_buffer{[](size_t size) { + uint8_t* ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_stream_size), + cudaFreeHost}; } - max_stream_size = std::max(max_stream_size, stream_size); - } + }(); - if (all_device_write) { - return pinned_buffer{nullptr, cudaFreeHost}; - } else { - return pinned_buffer{[](size_t size) { - uint8_t* ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_stream_size), - cudaFreeHost}; + // Compress the data streams + rmm::device_buffer compressed_data(compressed_bfr_size, stream); + hostdevice_vector comp_out(num_compressed_blocks, stream); + hostdevice_vector comp_in(num_compressed_blocks, stream); + if (compression_kind_ != 
NONE) { + strm_descs.host_to_device(stream); + gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), + num_compressed_blocks, + compression_kind_, + compression_blocksize_, + max_compressed_block_size, + strm_descs, + enc_data.streams, + comp_in, + comp_out, + stream); + strm_descs.device_to_host(stream); + comp_out.device_to_host(stream, true); } - }(); - - // Compress the data streams - rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_out(num_compressed_blocks, stream); - hostdevice_vector comp_in(num_compressed_blocks, stream); - if (compression_kind_ != NONE) { - strm_descs.host_to_device(stream); - gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), - num_compressed_blocks, - compression_kind_, - compression_blocksize_, - max_compressed_block_size, - strm_descs, - enc_data.streams, - comp_in, - comp_out, - stream); - strm_descs.device_to_host(stream); - comp_out.device_to_host(stream, true); - } - ProtobufWriter pbw_(&buffer_); - - // Write stripes - std::vector> write_tasks; - for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { - auto const& rowgroups_range = segmentation.stripes[stripe_id]; - auto& stripe = stripes[stripe_id]; - - stripe.offset = out_sink_->bytes_written(); - - // Column (skippable) index streams appear at the start of the stripe - for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { - write_index_stream(stripe_id, - stream_id, - orc_table.columns, - rowgroups_range, - enc_data.streams, - strm_descs, - comp_out, - &stripe, - &streams, - &pbw_); - } + ProtobufWriter pbw_(&buffer_); + + // Write stripes + std::vector> write_tasks; + for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { + auto const& rowgroups_range = segmentation.stripes[stripe_id]; + auto& stripe = stripes[stripe_id]; + + stripe.offset = out_sink_->bytes_written(); + + // Column (skippable) index streams appear at the start of the stripe + for (size_type stream_id = 0; stream_id < num_index_streams; ++stream_id) { + write_index_stream(stripe_id, + stream_id, + orc_table.columns, + rowgroups_range, + enc_data.streams, + strm_descs, + comp_out, + &stripe, + &streams, + &pbw_); + } - // Column data consisting one or more separate streams - for (auto const& strm_desc : strm_descs[stripe_id]) { - write_tasks.push_back( - write_data_stream(strm_desc, - enc_data.streams[strm_desc.column_id][rowgroups_range.first], - static_cast(compressed_data.data()), - stream_output.get(), - &stripe, - &streams)); - } + // Column data consisting one or more separate streams + for (auto const& strm_desc : strm_descs[stripe_id]) { + write_tasks.push_back( + write_data_stream(strm_desc, + enc_data.streams[strm_desc.column_id][rowgroups_range.first], + static_cast(compressed_data.data()), + stream_output.get(), + &stripe, + &streams)); + } - // Write stripefooter consisting of stream information - StripeFooter sf; - sf.streams = streams; - sf.columns.resize(orc_table.num_columns() + 1); - sf.columns[0].kind = DIRECT; - for (size_t i = 1; i < sf.columns.size(); ++i) { - sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); - sf.columns[i].dictionarySize = - (sf.columns[i].kind == DICTIONARY_V2) - ? 
orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings - : 0; - if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } + // Write stripefooter consisting of stream information + StripeFooter sf; + sf.streams = streams; + sf.columns.resize(orc_table.num_columns() + 1); + sf.columns[0].kind = DIRECT; + for (size_t i = 1; i < sf.columns.size(); ++i) { + sf.columns[i].kind = orc_table.column(i - 1).orc_encoding(); + sf.columns[i].dictionarySize = + (sf.columns[i].kind == DICTIONARY_V2) + ? orc_table.column(i - 1).host_stripe_dict(stripe_id)->num_strings + : 0; + if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } + } + buffer_.resize((compression_kind_ != NONE) ? 3 : 0); + pbw_.write(sf); + stripe.footerLength = buffer_.size(); + if (compression_kind_ != NONE) { + uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; + buffer_[0] = static_cast(uncomp_sf_len >> 0); + buffer_[1] = static_cast(uncomp_sf_len >> 8); + buffer_[2] = static_cast(uncomp_sf_len >> 16); + } + out_sink_->host_write(buffer_.data(), buffer_.size()); } - buffer_.resize((compression_kind_ != NONE) ? 3 : 0); - pbw_.write(sf); - stripe.footerLength = buffer_.size(); - if (compression_kind_ != NONE) { - uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; - buffer_[0] = static_cast(uncomp_sf_len >> 0); - buffer_[1] = static_cast(uncomp_sf_len >> 8); - buffer_[2] = static_cast(uncomp_sf_len >> 16); + for (auto const& task : write_tasks) { + task.wait(); } - out_sink_->host_write(buffer_.data(), buffer_.size()); - } - for (auto const& task : write_tasks) { - task.wait(); - } - if (column_stats.size() != 0) { - // File-level statistics - // NOTE: Excluded from chunked write mode to avoid the need for merging stats across calls - if (single_write_mode) { - // First entry contains total number of rows - buffer_.resize(0); - pbw_.putb(1 * 8 + PB_TYPE_VARINT); - pbw_.put_uint(num_rows); - ff.statistics.reserve(1 + orc_table.num_columns()); - ff.statistics.emplace_back(std::move(buffer_)); - // Add file stats, stored after stripe stats in `column_stats` - ff.statistics.insert( - ff.statistics.end(), - std::make_move_iterator(column_stats.begin()) + stripes.size() * orc_table.num_columns(), - std::make_move_iterator(column_stats.end())); - } - // Stripe-level statistics - size_t first_stripe = md.stripeStats.size(); - md.stripeStats.resize(first_stripe + stripes.size()); - for (size_t stripe_id = 0; stripe_id < stripes.size(); stripe_id++) { - md.stripeStats[first_stripe + stripe_id].colStats.resize(1 + orc_table.num_columns()); - buffer_.resize(0); - pbw_.putb(1 * 8 + PB_TYPE_VARINT); - pbw_.put_uint(stripes[stripe_id].numberOfRows); - md.stripeStats[first_stripe + stripe_id].colStats[0] = std::move(buffer_); - for (size_t col_idx = 0; col_idx < orc_table.num_columns(); col_idx++) { - size_t idx = stripes.size() * col_idx + stripe_id; - if (idx < column_stats.size()) { - md.stripeStats[first_stripe + stripe_id].colStats[1 + col_idx] = - std::move(column_stats[idx]); + if (not column_stats.empty()) { + // File-level statistics + // NOTE: Excluded from chunked write mode to avoid the need for merging stats across calls + if (single_write_mode) { + // First entry contains total number of rows + buffer_.resize(0); + pbw_.putb(1 * 8 + PB_TYPE_VARINT); + pbw_.put_uint(num_rows); + ff.statistics.reserve(1 + orc_table.num_columns()); + ff.statistics.emplace_back(std::move(buffer_)); + // Add file stats, stored after stripe stats in `column_stats` + 
ff.statistics.insert(
+        ff.statistics.end(),
+        std::make_move_iterator(column_stats.begin()) + stripes.size() * orc_table.num_columns(),
+        std::make_move_iterator(column_stats.end()));
+    }
+    // Stripe-level statistics
+    size_t first_stripe = md.stripeStats.size();
+    md.stripeStats.resize(first_stripe + stripes.size());
+    for (size_t stripe_id = 0; stripe_id < stripes.size(); stripe_id++) {
+      md.stripeStats[first_stripe + stripe_id].colStats.resize(1 + orc_table.num_columns());
+      buffer_.resize(0);
+      pbw_.putb(1 * 8 + PB_TYPE_VARINT);
+      pbw_.put_uint(stripes[stripe_id].numberOfRows);
+      md.stripeStats[first_stripe + stripe_id].colStats[0] = std::move(buffer_);
+      for (size_t col_idx = 0; col_idx < orc_table.num_columns(); col_idx++) {
+        size_t idx = stripes.size() * col_idx + stripe_id;
+        if (idx < column_stats.size()) {
+          md.stripeStats[first_stripe + stripe_id].colStats[1 + col_idx] =
+            std::move(column_stats[idx]);
+        }
       }
     }
   }
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 283715478a0..a7f9aec7bb4 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -179,6 +179,7 @@ class hostdevice_2dvector {
 
   auto size() const noexcept { return _size; }
   auto count() const noexcept { return _size.first * _size.second; }
+  auto is_empty() const noexcept { return count() == 0; }
 
   T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); }
   T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); }
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 6b02874146e..dc176992434 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1526,3 +1526,18 @@ def test_orc_writer_rle_stream_size(datadir, tmpdir):
     # Segfaults when RLE stream sizes don't account for varint length
     pa_out = pa.orc.ORCFile(reencoded).read()
     assert_eq(df.to_pandas(), pa_out)
+
+
+def test_empty_columns():
+    buffer = BytesIO()
+    # string and decimal columns have additional steps that need to be skipped
+    expected = cudf.DataFrame(
+        {
+            "string": cudf.Series([], dtype="str"),
+            "decimal": cudf.Series([], dtype=cudf.Decimal64Dtype(10, 1)),
+        }
+    )
+    expected.to_orc(buffer, compression="snappy")
+
+    got_df = cudf.read_orc(buffer)
+    assert_eq(expected, got_df)

From 0c08543955a01470baa4fbdbab927298dcf6afd9 Mon Sep 17 00:00:00 2001
From: Devavret Makkar
Date: Fri, 3 Dec 2021 04:53:37 +0530
Subject: [PATCH 047/202] Update cmake and conda to 22.02 (#9746)

The update to 22.02 was missed in one conda environment recipe (the CUDA 11.5 one); this PR adds it. It also updates the project version from 21.12 to 22.02 in the CMake files.
Authors:
  - Devavret Makkar (https://github.com/devavret)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Ray Douglass (https://github.com/raydouglass)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/9746
---
 ci/release/update-version.sh     | 6 +++---
 cpp/CMakeLists.txt               | 2 +-
 cpp/libcudf_kafka/CMakeLists.txt | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index eeb76a15fcc..86432a92128 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -30,13 +30,13 @@ function sed_runner() {
 }
 
 # cpp update
-sed_runner 's/'"CUDF VERSION .* LANGUAGES"'/'"CUDF VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/CMakeLists.txt
 
 # cpp libcudf_kafka update
-sed_runner 's/'"CUDA_KAFKA VERSION .* LANGUAGES"'/'"CUDA_KAFKA VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/libcudf_kafka/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt
 
 # cpp cudf_jni update
-sed_runner 's/'"CUDF_JNI VERSION .* LANGUAGES"'/'"CUDF_JNI VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' java/src/main/native/CMakeLists.txt
+sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' java/src/main/native/CMakeLists.txt
 
 # rapids-cmake version
 sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' fetch_rapids.cmake
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 50bdc30b292..e2b317f2e03 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF)
 
 project(
   CUDF
-  VERSION 21.12.00
+  VERSION 22.02.00
   LANGUAGES C CXX CUDA
 )
diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt
index 435ff3b5987..d0874b57c2d 100644
--- a/cpp/libcudf_kafka/CMakeLists.txt
+++ b/cpp/libcudf_kafka/CMakeLists.txt
@@ -22,7 +22,7 @@ include(rapids-find)
 
 project(
   CUDA_KAFKA
-  VERSION 21.12.00
+  VERSION 22.02.00
   LANGUAGES CXX
 )

From ce64e53264d21c6e59fe98548796a7b6bae24c07 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora"
Date: Thu, 2 Dec 2021 20:19:12 -0600
Subject: [PATCH 048/202] Add directory-partitioned data support to cudf.read_parquet (#9720)

Closes #9684
Closes #9690

This PR refactors path handling in `cudf.read_parquet` and uses `pyarrow.dataset` to add support for directory-partitioned datasets (with full filtering support at row-group granularity). Since it is my understanding that some users may wish for directory-partitioned columns to be represented as a raw dtype (rather than always becoming categorical), I also added an optional `categorical_partitions` argument (open to suggestions on a better name).
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/9720 --- python/cudf/cudf/io/json.py | 2 +- python/cudf/cudf/io/orc.py | 2 +- python/cudf/cudf/io/parquet.py | 286 +++++++++++++++++++---- python/cudf/cudf/tests/test_parquet.py | 94 +++++++- python/cudf/cudf/tests/test_s3.py | 9 +- python/cudf/cudf/utils/ioutils.py | 26 ++- python/dask_cudf/dask_cudf/io/parquet.py | 7 +- 7 files changed, 355 insertions(+), 71 deletions(-) diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index a48cfd07d3f..1f876214b16 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -37,7 +37,7 @@ def read_json( for source in path_or_buf: if ioutils.is_directory(source, **kwargs): fs = ioutils._ensure_filesystem( - passed_filesystem=None, path=source + passed_filesystem=None, path=source, **kwargs ) source = ioutils.stringify_pathlike(source) source = fs.sep.join([source, "*.json"]) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index ecb1b0cd185..c1cce3f996f 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -316,7 +316,7 @@ def read_orc( for source in filepath_or_buffer: if ioutils.is_directory(source, **kwargs): fs = ioutils._ensure_filesystem( - passed_filesystem=None, path=source + passed_filesystem=None, path=source, **kwargs, ) source = stringify_path(source) source = fs.sep.join([source, "*.orc"]) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 9d665d9a0a5..04d64969a16 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -12,6 +12,7 @@ import cudf from cudf._lib import parquet as libparquet from cudf.api.types import is_list_like +from cudf.core.column import as_column, build_categorical_column from cudf.utils import ioutils @@ -80,7 +81,7 @@ def write_to_dataset( kwargs for to_parquet function. """ - fs = ioutils._ensure_filesystem(fs, root_path) + fs = ioutils._ensure_filesystem(fs, root_path, **kwargs) fs.mkdirs(root_path, exist_ok=True) metadata = [] @@ -163,11 +164,19 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names -def _process_row_groups(paths, fs, filters=None, row_groups=None): +def _process_dataset( + paths, fs, filters=None, row_groups=None, categorical_partitions=True, +): + # Returns: + # file_list - Expanded/filtered list of paths + # row_groups - Filtered list of row-group selections + # partition_keys - list of partition keys for each file + # partition_categories - Categories for each partition # The general purpose of this function is to (1) expand # directory input into a list of paths (using the pyarrow - # dataset API), and (2) to apply row-group filters. 
+ # dataset API), (2) to apply row-group filters, and (3) + # to discover directory-partitioning information # Deal with case that the user passed in a directory name file_list = paths @@ -186,28 +195,107 @@ def _process_row_groups(paths, fs, filters=None, row_groups=None): if len(file_list) == 0: raise FileNotFoundError(f"{paths} could not be resolved to any files") - if filters is not None: - # Load IDs of filtered row groups for each file in dataset - filtered_rg_ids = defaultdict(list) - for fragment in dataset.get_fragments(filter=filters): - for rg_fragment in fragment.split_by_row_group(filters): - for rg_info in rg_fragment.row_groups: - filtered_rg_ids[rg_fragment.path].append(rg_info.id) - - # Initialize row_groups to be selected - if row_groups is None: - row_groups = [None for _ in dataset.files] - - # Store IDs of selected row groups for each file - for i, file in enumerate(dataset.files): - if row_groups[i] is None: - row_groups[i] = filtered_rg_ids[file] - else: - row_groups[i] = filter( - lambda id: id in row_groups[i], filtered_rg_ids[file] + # Deal with directory partitioning + # Get all partition keys (without filters) + partition_categories = defaultdict(list) + file_fragment = None + for file_fragment in dataset.get_fragments(): + keys = ds._get_partition_keys(file_fragment.partition_expression) + if not (keys or partition_categories): + # Bail - This is not a directory-partitioned dataset + break + for k, v in keys.items(): + if v not in partition_categories[k]: + partition_categories[k].append(v) + if not categorical_partitions: + # Bail - We don't need to discover all categories. + # We only need to save the partition keys from this + # first `file_fragment` + break + + if partition_categories and file_fragment is not None: + # Check/correct order of `categories` using last file_frag, + # because `_get_partition_keys` does NOT preserve the + # partition-hierarchy order of the keys. + cat_keys = [ + part.split("=")[0] + for part in file_fragment.path.split(fs.sep) + if "=" in part + ] + if set(partition_categories) == set(cat_keys): + partition_categories = { + k: partition_categories[k] + for k in cat_keys + if k in partition_categories + } + + # If we do not have partitioned data and + # are not filtering, we can return here + if filters is None and not partition_categories: + return file_list, row_groups, [], {} + + # Record initial row_groups input + row_groups_map = {} + if row_groups is not None: + # Make sure paths and row_groups map 1:1 + # and save the initial mapping + if len(paths) != len(file_list): + raise ValueError( + "Cannot specify a row_group selection for a directory path." 
+ ) + row_groups_map = {path: rgs for path, rgs in zip(paths, row_groups)} + + # Apply filters and discover partition columns + partition_keys = [] + if partition_categories or filters is not None: + file_list = [] + if filters is not None: + row_groups = [] + for file_fragment in dataset.get_fragments(filter=filters): + path = file_fragment.path + + # Extract hive-partition keys, and make sure they + # are orederd the same as they are in `partition_categories` + if partition_categories: + raw_keys = ds._get_partition_keys( + file_fragment.partition_expression + ) + partition_keys.append( + [ + (name, raw_keys[name]) + for name in partition_categories.keys() + ] ) - return file_list, row_groups + # Apply row-group filtering + selection = row_groups_map.get(path, None) + if selection is not None or filters is not None: + filtered_row_groups = [ + rg_info.id + for rg_fragment in file_fragment.split_by_row_group( + filters, schema=dataset.schema, + ) + for rg_info in rg_fragment.row_groups + ] + file_list.append(path) + if filters is not None: + if selection is None: + row_groups.append(filtered_row_groups) + else: + row_groups.append( + [ + rg_id + for rg_id in filtered_row_groups + if rg_id in selection + ] + ) + + return ( + file_list, + row_groups, + partition_keys, + partition_categories if categorical_partitions else {}, + ) def _get_byte_ranges(file_list, row_groups, columns, fs, **kwargs): @@ -319,6 +407,7 @@ def read_parquet( strings_to_categorical=False, use_pandas_metadata=True, use_python_file_object=False, + categorical_partitions=True, *args, **kwargs, ): @@ -345,17 +434,29 @@ def read_parquet( # Start by trying construct a filesystem object, so we # can apply filters on remote file-systems fs, paths = ioutils._get_filesystem_and_paths(filepath_or_buffer, **kwargs) - filepath_or_buffer = paths if paths else filepath_or_buffer - if fs is None and filters is not None: - raise ValueError("cudf cannot apply filters to open file objects.") - # Apply filters now (before converting non-local paths to buffers). - # Note that `_process_row_groups` will also expand `filepath_or_buffer` - # into a full list of files if it is a directory. - if fs is not None: - filepath_or_buffer, row_groups = _process_row_groups( - filepath_or_buffer, fs, filters=filters, row_groups=row_groups, + # Use pyarrow dataset to detect/process directory-partitioned + # data and apply filters. Note that we can only support partitioned + # data and filtering if the input is a single directory or list of + # paths. + partition_keys = [] + partition_categories = {} + if fs and paths: + ( + paths, + row_groups, + partition_keys, + partition_categories, + ) = _process_dataset( + paths, + fs, + filters=filters, + row_groups=row_groups, + categorical_partitions=categorical_partitions, ) + elif filters is not None: + raise ValueError("cudf cannot apply filters to open file objects.") + filepath_or_buffer = paths if paths else filepath_or_buffer # Check if we should calculate the specific byte-ranges # needed for each parquet file. 
We always do this when we @@ -380,15 +481,6 @@ def read_parquet( filepaths_or_buffers = [] for i, source in enumerate(filepath_or_buffer): - if ioutils.is_directory(source, **kwargs): - # Note: For now, we know `fs` is an fsspec filesystem - # object, but it may be an arrow object in the future - fsspec_fs = ioutils._ensure_filesystem( - passed_filesystem=fs, path=source - ) - source = ioutils.stringify_pathlike(source) - source = fsspec_fs.sep.join([source, "*.parquet"]) - tmp_source, compression = ioutils.get_filepath_or_buffer( path_or_data=source, compression=None, @@ -410,6 +502,117 @@ def read_parquet( else: filepaths_or_buffers.append(tmp_source) + # Warn user if they are not using cudf for IO + # (There is a good chance this was not the intention) + if engine != "cudf": + warnings.warn( + "Using CPU via PyArrow to read Parquet dataset." + "This option is both inefficient and unstable!" + ) + if filters is not None: + warnings.warn( + "Parquet row-group filtering is only supported with " + "'engine=cudf'. Use pandas or pyarrow API directly " + "for full CPU-based filtering functionality." + ) + + return _parquet_to_frame( + filepaths_or_buffers, + engine, + *args, + columns=columns, + row_groups=row_groups, + skiprows=skiprows, + num_rows=num_rows, + strings_to_categorical=strings_to_categorical, + use_pandas_metadata=use_pandas_metadata, + partition_keys=partition_keys, + partition_categories=partition_categories, + **kwargs, + ) + + +def _parquet_to_frame( + paths_or_buffers, + *args, + row_groups=None, + partition_keys=None, + partition_categories=None, + **kwargs, +): + + # If this is not a partitioned read, only need + # one call to `_read_parquet` + if not partition_keys: + return _read_parquet( + paths_or_buffers, *args, row_groups=row_groups, **kwargs, + ) + + # For partitioned data, we need a distinct read for each + # unique set of partition keys. Therefore, we start by + # aggregating all paths with matching keys using a dict + plan = {} + for i, (keys, path) in enumerate(zip(partition_keys, paths_or_buffers)): + rgs = row_groups[i] if row_groups else None + tkeys = tuple(keys) + if tkeys in plan: + plan[tkeys][0].append(path) + if rgs is not None: + plan[tkeys][1].append(rgs) + else: + plan[tkeys] = ([path], None if rgs is None else [rgs]) + + dfs = [] + for part_key, (key_paths, key_row_groups) in plan.items(): + # Add new DataFrame to our list + dfs.append( + _read_parquet( + key_paths, *args, row_groups=key_row_groups, **kwargs, + ) + ) + # Add partition columns to the last DataFrame + for (name, value) in part_key: + if partition_categories and name in partition_categories: + # Build the categorical column from `codes` + codes = as_column( + partition_categories[name].index(value), + length=len(dfs[-1]), + ) + dfs[-1][name] = build_categorical_column( + categories=partition_categories[name], + codes=codes, + size=codes.size, + offset=codes.offset, + ordered=False, + ) + else: + # Not building categorical columns, so + # `value` is already what we want + dfs[-1][name] = as_column(value, length=len(dfs[-1])) + + # Concatenate dfs and return. + # Assume we can ignore the index if it has no name. 
+ return ( + cudf.concat(dfs, ignore_index=dfs[-1].index.name is None) + if len(dfs) > 1 + else dfs[0] + ) + + +def _read_parquet( + filepaths_or_buffers, + engine, + columns=None, + row_groups=None, + skiprows=None, + num_rows=None, + strings_to_categorical=None, + use_pandas_metadata=None, + *args, + **kwargs, +): + # Simple helper function to dispatch between + # cudf and pyarrow to read parquet data if engine == "cudf": return libparquet.read_parquet( filepaths_or_buffers, @@ -421,7 +624,6 @@ def read_parquet( use_pandas_metadata=use_pandas_metadata, ) else: - warnings.warn("Using CPU via PyArrow to read Parquet dataset.") return cudf.DataFrame.from_arrow( pq.ParquetDataset(filepaths_or_buffers).read_pandas( columns=columns, *args, **kwargs diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index b6595be9566..516ee0d17d3 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1578,7 +1578,7 @@ def test_parquet_writer_bytes_io(simple_gdf): @pytest.mark.parametrize("filename", ["myfile.parquet", None]) @pytest.mark.parametrize("cols", [["b"], ["c", "b"]]) -def test_parquet_write_partitioned(tmpdir_factory, cols, filename): +def test_parquet_partitioned(tmpdir_factory, cols, filename): # Checks that write_to_dataset is wrapping to_parquet # as expected gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) @@ -1597,10 +1597,14 @@ def test_parquet_write_partitioned(tmpdir_factory, cols, filename): gdf_dir, index=False, partition_cols=cols, partition_file_name=filename ) - # Use pandas since dataset may be partitioned - expect = pd.read_parquet(pdf_dir) - got = pd.read_parquet(gdf_dir) - assert_eq(expect, got) + # Read back with pandas to compare + expect_pd = pd.read_parquet(pdf_dir) + got_pd = pd.read_parquet(gdf_dir) + assert_eq(expect_pd, got_pd) + + # Check that cudf and pd return the same read + got_cudf = cudf.read_parquet(gdf_dir) + assert_eq(got_pd, got_cudf) # If filename is specified, check that it is correct if filename: @@ -1629,9 +1633,9 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): gdf.to_parquet(dir1, partition_cols=cols) cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols) - # cudf read_parquet cannot handle partitioned dataset - expect = pd.read_parquet(dir1) - got = pd.read_parquet(dir2) + # Read back with cudf + expect = cudf.read_parquet(dir1) + got = cudf.read_parquet(dir2) assert_eq(expect, got) gdf = cudf.DataFrame( @@ -1645,6 +1649,80 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols): gdf.to_parquet(dir1, partition_cols=cols) +@pytest.mark.parametrize( + "pfilters", [[("b", "==", "b")], [("b", "==", "a"), ("c", "==", 1)]], +) +@pytest.mark.parametrize("selection", ["directory", "files", "row-groups"]) +@pytest.mark.parametrize("use_cat", [True, False]) +def test_read_parquet_partitioned_filtered( + tmpdir, pfilters, selection, use_cat +): + path = str(tmpdir) + size = 100 + df = cudf.DataFrame( + { + "a": np.arange(0, stop=size, dtype="int64"), + "b": np.random.choice(list("abcd"), size=size), + "c": np.random.choice(np.arange(4), size=size), + } + ) + df.to_parquet(path, partition_cols=["c", "b"]) + + if selection == "files": + # Pass in a list of paths + fs = get_fs_token_paths(path)[0] + read_path = fs.find(path) + row_groups = None + elif selection == "row-groups": + # Pass in a list of paths AND row-group ids + fs = get_fs_token_paths(path)[0] + read_path = fs.find(path) + row_groups = [[0] for p in read_path] + else: + # Pass in a directory path + # 
(row-group selection not allowed in this case) + read_path = path + row_groups = None + + # Filter on partitioned columns + expect = pd.read_parquet(read_path, filters=pfilters) + got = cudf.read_parquet( + read_path, + filters=pfilters, + row_groups=row_groups, + categorical_partitions=use_cat, + ) + if use_cat: + assert got.dtypes["b"] == "category" + assert got.dtypes["c"] == "category" + else: + # Check that we didn't get categorical + # columns, but convert back to categorical + # for comparison with pandas + assert got.dtypes["b"] == "object" + assert got.dtypes["c"] == "int" + got["b"] = pd.Categorical( + got["b"].to_pandas(), categories=list("abcd") + ) + got["c"] = pd.Categorical( + got["c"].to_pandas(), categories=np.arange(4) + ) + assert_eq(expect, got) + + # Filter on non-partitioned column. + # Cannot compare to pandas, since the pyarrow + # backend will filter by row (and cudf can + # only filter by column, for now) + filters = [("a", "==", 10)] + got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,) + assert len(got) < len(df) and 10 in got["a"] + + # Filter on both kinds of columns + filters = [[("a", "==", 10)], [("c", "==", 1)]] + got = cudf.read_parquet(read_path, filters=filters, row_groups=row_groups,) + assert len(got) < len(df) and (1 in got["c"] and 10 in got["a"]) + + def test_parquet_writer_chunked_metadata(tmpdir, simple_pdf, simple_gdf): gdf_fname = tmpdir.join("gdf.parquet") test_path = "test/path" diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index dea876891f8..5738e1f0d00 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -346,12 +346,17 @@ def test_read_parquet_filters(s3_base, s3so, pdf, python_file): assert_eq(pdf.iloc[:0], got.reset_index(drop=True)) -def test_write_parquet(s3_base, s3so, pdf): +@pytest.mark.parametrize("partition_cols", [None, ["String"]]) +def test_write_parquet(s3_base, s3so, pdf, partition_cols): fname = "test_parquet_writer.parquet" bname = "parquet" gdf = cudf.from_pandas(pdf) with s3_context(s3_base=s3_base, bucket=bname) as s3fs: - gdf.to_parquet("s3://{}/{}".format(bname, fname), storage_options=s3so) + gdf.to_parquet( + "s3://{}/{}".format(bname, fname), + partition_cols=partition_cols, + storage_options=s3so, + ) assert s3fs.exists("s3://{}/{}".format(bname, fname)) got = pd.read_parquet(s3fs.open("s3://{}/{}".format(bname, fname))) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0f9d9d53b23..e6c031acac7 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -154,6 +154,9 @@ strings_to_categorical : boolean, default False If True, return string columns as GDF_CATEGORY dtype; if False, return a as GDF_STRING dtype. +categorical_partitions : boolean, default True + Whether directory-partitioned columns should be interpreted as categorical + or raw dtypes. use_pandas_metadata : boolean, default True If True and dataset has custom PANDAS schema metadata, ensure that index columns are also loaded. 
@@ -1129,7 +1132,7 @@ def ensure_single_filepath_or_buffer(path_or_data, **kwargs): storage_options = kwargs.get("storage_options") path_or_data = os.path.expanduser(path_or_data) try: - fs, _, paths = fsspec.get_fs_token_paths( + fs, _, paths = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options ) except ValueError as e: @@ -1153,9 +1156,9 @@ def is_directory(path_or_data, **kwargs): storage_options = kwargs.get("storage_options") path_or_data = os.path.expanduser(path_or_data) try: - fs, _, paths = fsspec.get_fs_token_paths( + fs = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options - ) + )[0] except ValueError as e: if str(e).startswith("Protocol not known"): return False @@ -1189,10 +1192,8 @@ def _get_filesystem_and_paths(path_or_data, **kwargs): else: path_or_data = [path_or_data] - # Pyarrow did not support the protocol or storage options. - # Fall back to fsspec try: - fs, _, fs_paths = fsspec.get_fs_token_paths( + fs, _, fs_paths = get_fs_token_paths( path_or_data, mode="rb", storage_options=storage_options ) return_paths = fs_paths @@ -1322,9 +1323,9 @@ def get_writer_filepath_or_buffer(path_or_data, mode, **kwargs): if isinstance(path_or_data, str): storage_options = kwargs.get("storage_options", {}) path_or_data = os.path.expanduser(path_or_data) - fs, _, _ = fsspec.get_fs_token_paths( + fs = get_fs_token_paths( path_or_data, mode=mode or "w", storage_options=storage_options - ) + )[0] if not _is_local_filesystem(fs): filepath_or_buffer = fsspec.open( @@ -1513,11 +1514,12 @@ def _prepare_filters(filters): return filters -def _ensure_filesystem(passed_filesystem, path): +def _ensure_filesystem(passed_filesystem, path, **kwargs): if passed_filesystem is None: - return get_fs_token_paths(path[0] if isinstance(path, list) else path)[ - 0 - ] + return get_fs_token_paths( + path[0] if isinstance(path, list) else path, + storage_options=kwargs.get("storage_options", {}), + )[0] return passed_filesystem diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index b47a5e78095..a49d73493ec 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -126,11 +126,8 @@ def _read_paths( # Build the column from `codes` directly # (since the category is often a larger dtype) - codes = ( - as_column(partitions[i].keys.index(index2)) - .as_frame() - .repeat(len(df)) - ._data[None] + codes = as_column( + partitions[i].keys.index(index2), length=len(df), ) df[name] = build_categorical_column( categories=partitions[i].keys, From e82cc62e2ea61211c64ba4784cb131d5b535644c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 3 Dec 2021 04:46:25 -0800 Subject: [PATCH 049/202] Fix join of MultiIndex to Index with one column and overlapping name. 
(#9830) This PR resolves #9823 Authors: - Vyas Ramasubramani (https://github.com/vyasr) - Ashwin Srinath (https://github.com/shwina) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/9830 --- python/cudf/cudf/core/_base_index.py | 4 ++-- python/cudf/cudf/tests/test_joining.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d688b75ed14..2fcc976d8e1 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1147,14 +1147,14 @@ def join( if isinstance(lhs, cudf.MultiIndex): if level is not None and isinstance(level, int): on = lhs._data.select_by_index(level).names[0] - right_names = (on,) or right_names + right_names = (on,) if on is not None else right_names on = right_names[0] if how == "outer": how = "left" elif how == "right": how = "inner" else: - # Both are nomal indices + # Both are normal indices right_names = left_names on = right_names[0] diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 0518cc2c9b9..d25c6130bfb 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -2150,3 +2150,16 @@ def test_join_redundant_params(): lhs.merge(rhs, right_on="a", left_index=True, right_index=True) with pytest.raises(ValueError): lhs.merge(rhs, left_on="c", right_on="b") + + +def test_join_multiindex_index(): + # test joining a MultiIndex with an Index with overlapping name + lhs = ( + cudf.DataFrame({"a": [2, 3, 1], "b": [3, 4, 2]}) + .set_index(["a", "b"]) + .index + ) + rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index("a").index + expect = lhs.to_pandas().join(rhs.to_pandas(), how="inner") + got = lhs.join(rhs, how="inner") + assert_join_results_equal(expect, got, how="inner") From 62103c6a99b4f2df00965e733542e08ce4b11448 Mon Sep 17 00:00:00 2001 From: Raza Jafri Date: Fri, 3 Dec 2021 08:34:47 -0800 Subject: [PATCH 050/202] Added a few more tests for Decimal to String cast (#9818) This PR adds a few more edge cases as a sanity test on the request of @sameerz Authors: - Raza Jafri (https://github.com/razajafri) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9818 --- .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 31a52eb2ec0..7120a40a26a 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3376,6 +3376,8 @@ void testFixedWidthCast() { void testCastBigDecimalToString() { BigDecimal[] bigValues = {new BigDecimal("923121331938210123.321"), new BigDecimal("9223372036854775808.191"), + new BigDecimal("-9.223"), + new BigDecimal("0.000"), new BigDecimal("9328323982309091029831.002") }; @@ -3383,9 +3385,21 @@ void testCastBigDecimalToString() { ColumnVector values = cv.castTo(DType.STRING); ColumnVector expected = ColumnVector.fromStrings("923121331938210123.321", "9223372036854775808.191", - "9328323982309091029831.002")) { + "-9.223", + "0.000", + "9328323982309091029831.002")) { assertColumnsAreEqual(expected, values); } + + BigDecimal[] bigValues0 = {new BigDecimal("992983283728193827182918744829283742232")}; + try { + ColumnVector cv = ColumnVector.fromDecimals(bigValues0); 
+ if (cv != null) { + cv.close(); + } + fail("Precision check should've thrown an IllegalArgumentException"); + } catch (IllegalArgumentException iae) { + } } @Test From fdd9bb00dc0ba5ac373feaa079b782029130dae3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 3 Dec 2021 16:13:28 -0700 Subject: [PATCH 051/202] Add JNI for `cudf::drop_duplicates` (#9841) This adds Java binding for `cudf::drop_duplicates`. Note that when choosing which duplicate element to keep, only `KEEP_FIRST` or `KEEP_LAST` option can be selected. In other words, this does not support `KEEP_NONE` to remove all duplicate elements. Closes #9115. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9841 --- java/src/main/java/ai/rapids/cudf/Table.java | 92 ++++++++++++------- java/src/main/native/src/TableJni.cpp | 26 ++++++ .../test/java/ai/rapids/cudf/TableTest.java | 26 ++++++ 3 files changed, 112 insertions(+), 32 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b11808ed023..e32d466e853 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -645,6 +645,10 @@ private static native long[] conditionalLeftAntiJoinGatherMapWithCount(long left private static native long[] filter(long input, long mask); + private static native long[] dropDuplicates(long nativeHandle, int[] keyColumns, + boolean keepFirst, boolean nullsEqual, + boolean nullsBefore) throws CudfException; + private static native long[] gather(long tableHandle, long gatherView, boolean checkBounds); private static native long[] convertToRows(long nativeHandle); @@ -1820,6 +1824,30 @@ public Table filter(ColumnView mask) { return new Table(filter(nativeHandle, mask.getNativeView())); } + /** + * Copy rows of the current table to an output table such that duplicate rows in the key columns + * are ignored (i.e., only one row from the duplicate ones will be copied). These keys columns are + * a subset of the current table columns and their indices are specified by an input array. + * + * Currently, the output table is sorted by key columns, using stable sort. However, this is not + * guaranteed in the future. + * + * @param keyColumns Array of indices representing key columns from the current table. + * @param keepFirst If it is true, the first row with a duplicated key will be copied. Otherwise, + * copy the last row with a duplicated key. + * @param nullsEqual Flag to denote whether nulls are treated as equal when comparing rows of the + * key columns to check for uniqueness. + * @param nullsBefore Flag to specify whether nulls in the key columns will appear before or + * after non-null elements when sorting the table. + * + * @return Table with unique keys. + */ + public Table dropDuplicates(int[] keyColumns, boolean keepFirst, boolean nullsEqual, + boolean nullsBefore) { + assert keyColumns.length >= 1 : "Input keyColumns must contain indices of at least one column"; + return new Table(dropDuplicates(nativeHandle, keyColumns, keepFirst, nullsEqual, nullsBefore)); + } + /** * Split a table at given boundaries, but the result of each split has memory that is laid out * in a contiguous range of memory. This allows for us to optimize copying the data in a single @@ -3005,27 +3033,27 @@ public Table aggregate(GroupByAggregationOnColumn... 
aggregates) { } /** - * Computes row-based window aggregation functions on the Table/projection, + * Computes row-based window aggregation functions on the Table/projection, * based on windows specified in the argument. - * + * * This method enables queries such as the following SQL: - * - * SELECT user_id, - * MAX(sales_amt) OVER(PARTITION BY user_id ORDER BY date + * + * SELECT user_id, + * MAX(sales_amt) OVER(PARTITION BY user_id ORDER BY date * ROWS BETWEEN 1 PRECEDING and 1 FOLLOWING) * FROM my_sales_table WHERE ... - * + * * Each window-aggregation is represented by a different {@link AggregationOverWindow} argument, * indicating: * 1. the {@link Aggregation.Kind}, * 2. the number of rows preceding and following the current row, within a window, * 3. the minimum number of observations within the defined window - * + * * This method returns a {@link Table} instance, with one result column for each specified * window aggregation. - * + * * In this example, for the following input: - * + * * [ // user_id, sales_amt * { "user1", 10 }, * { "user2", 20 }, @@ -3037,19 +3065,19 @@ public Table aggregate(GroupByAggregationOnColumn... aggregates) { * { "user1", 60 }, * { "user2", 40 } * ] - * - * Partitioning (grouping) by `user_id` yields the following `sales_amt` vector + * + * Partitioning (grouping) by `user_id` yields the following `sales_amt` vector * (with 2 groups, one for each distinct `user_id`): - * + * * [ 10, 20, 10, 50, 60, 20, 30, 80, 40 ] * <-------user1-------->|<------user2-------> - * + * * The SUM aggregation is applied with 1 preceding and 1 following * row, with a minimum of 1 period. The aggregation window is thus 3 rows wide, * yielding the following column: - * + * * [ 30, 40, 80, 120, 110, 50, 130, 150, 120 ] - * + * * @param windowAggregates the window-aggregations to be performed * @return Table instance, with each column containing the result of each aggregation. * @throws IllegalArgumentException if the window arguments are not of type @@ -3068,7 +3096,7 @@ public Table aggregateWindows(AggregationOverWindow... windowAggregates) { for (int outputIndex = 0; outputIndex < windowAggregates.length; outputIndex++) { AggregationOverWindow agg = windowAggregates[outputIndex]; if (agg.getWindowOptions().getFrameType() != WindowOptions.FrameType.ROWS) { - throw new IllegalArgumentException("Expected ROWS-based window specification. Unexpected window type: " + throw new IllegalArgumentException("Expected ROWS-based window specification. Unexpected window type: " + agg.getWindowOptions().getFrameType()); } ColumnWindowOps ops = groupedOps.computeIfAbsent(agg.getColumnIndex(), (idx) -> new ColumnWindowOps()); @@ -3129,27 +3157,27 @@ public Table aggregateWindows(AggregationOverWindow... windowAggregates) { /** * Computes range-based window aggregation functions on the Table/projection, * based on windows specified in the argument. - * + * * This method enables queries such as the following SQL: - * - * SELECT user_id, - * MAX(sales_amt) OVER(PARTITION BY user_id ORDER BY date + * + * SELECT user_id, + * MAX(sales_amt) OVER(PARTITION BY user_id ORDER BY date * RANGE BETWEEN INTERVAL 1 DAY PRECEDING and CURRENT ROW) * FROM my_sales_table WHERE ... - * + * * Each window-aggregation is represented by a different {@link AggregationOverWindow} argument, * indicating: * 1. the {@link Aggregation.Kind}, * 2. the index for the timestamp column to base the window definitions on * 2. the number of DAYS preceding and following the current row's date, to consider in the window * 3. 
the minimum number of observations within the defined window - * + * * This method returns a {@link Table} instance, with one result column for each specified * window aggregation. - * + * * In this example, for the following input: - * - * [ // user, sales_amt, YYYYMMDD (date) + * + * [ // user, sales_amt, YYYYMMDD (date) * { "user1", 10, 20200101 }, * { "user2", 20, 20200101 }, * { "user1", 20, 20200102 }, @@ -3160,19 +3188,19 @@ public Table aggregateWindows(AggregationOverWindow... windowAggregates) { * { "user1", 60, 20200107 }, * { "user2", 40, 20200104 } * ] - * - * Partitioning (grouping) by `user_id`, and ordering by `date` yields the following `sales_amt` vector + * + * Partitioning (grouping) by `user_id`, and ordering by `date` yields the following `sales_amt` vector * (with 2 groups, one for each distinct `user_id`): - * + * * Date :(202001-) [ 01, 02, 03, 07, 07, 01, 01, 02, 04 ] * Input: [ 10, 20, 10, 50, 60, 20, 30, 80, 40 ] * <-------user1-------->|<---------user2---------> - * - * The SUM aggregation is applied, with 1 day preceding, and 1 day following, with a minimum of 1 period. + * + * The SUM aggregation is applied, with 1 day preceding, and 1 day following, with a minimum of 1 period. * The aggregation window is thus 3 *days* wide, yielding the following output column: - * + * * Results: [ 30, 40, 30, 110, 110, 130, 130, 130, 40 ] - * + * * @param windowAggregates the window-aggregations to be performed * @return Table instance, with each column containing the result of each aggregation. * @throws IllegalArgumentException if the window arguments are not of type diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index f3377bb002d..18e7936f322 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2676,6 +2676,32 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_filter(JNIEnv *env, jclas CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates( + JNIEnv *env, jclass, jlong input_jtable, jintArray key_columns, jboolean keep_first, + jboolean nulls_equal, jboolean nulls_before) { + JNI_NULL_CHECK(env, input_jtable, "input table is null", 0); + JNI_NULL_CHECK(env, key_columns, "input key_columns is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_jtable); + + static_assert(sizeof(jint) == sizeof(cudf::size_type), "Integer types mismatched."); + auto const native_keys_indices = cudf::jni::native_jintArray(env, key_columns); + auto const keys_indices = + std::vector(native_keys_indices.begin(), native_keys_indices.end()); + + auto result = cudf::drop_duplicates( + *input, keys_indices, + keep_first ? cudf::duplicate_keep_option::KEEP_FIRST : + cudf::duplicate_keep_option::KEEP_LAST, + nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL, + nulls_before ? 
cudf::null_order::BEFORE : cudf::null_order::AFTER,
+        rmm::mr::get_current_device_resource());
+    return cudf::jni::convert_table_for_return(env, result);
+  }
+  CATCH_STD(env, 0);
+}
+
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclass, jlong j_input,
                                                               jlong j_map, jboolean check_bounds) {
   JNI_NULL_CHECK(env, j_input, "input table is null", 0);
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 0b2f56895e9..a5779bf9dbb 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -6592,6 +6592,32 @@ void testTableBasedFilter() {
     }
   }
 
+  @Test
+  void testDropDuplicates() {
+    int[] keyColumns = new int[]{ 1 };
+
+    try (ColumnVector col1 = ColumnVector.fromBoxedInts(5, null, 3, 5, 8, 1);
+         ColumnVector col2 = ColumnVector.fromBoxedInts(20, null, null, 19, 21, 19);
+         Table input = new Table(col1, col2)) {
+
+      // Keep the first duplicate element.
+      try (Table result = input.dropDuplicates(keyColumns, true, true, true);
+           ColumnVector expectedCol1 = ColumnVector.fromBoxedInts(null, 5, 5, 8);
+           ColumnVector expectedCol2 = ColumnVector.fromBoxedInts(null, 19, 20, 21);
+           Table expected = new Table(expectedCol1, expectedCol2)) {
+        assertTablesAreEqual(expected, result);
+      }
+
+      // Keep the last duplicate element.
+      try (Table result = input.dropDuplicates(keyColumns, false, true, true);
+           ColumnVector expectedCol1 = ColumnVector.fromBoxedInts(3, 1, 5, 8);
+           ColumnVector expectedCol2 = ColumnVector.fromBoxedInts(null, 19, 20, 21);
+           Table expected = new Table(expectedCol1, expectedCol2)) {
+        assertTablesAreEqual(expected, result);
+      }
+    }
+  }
+
   private enum Columns {
     BOOL("BOOL"),
     INT("INT"),

From 8002cbd87367135a941d1145c9d489a8f82dc76d Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 6 Dec 2021 08:38:41 -0500
Subject: [PATCH 052/202] Allow runtime has_nulls parameter for row operators
 (#9623)

Closes #6952

This PR allows the `has_nulls` template parameter for row operators to be
used as a runtime parameter in places where the null-handling logic has
little to no effect on runtime performance. This can improve compile time as
described in #6952.
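As a rough sketch of the pattern this enables (adapted from the doc comments updated below; the function name and element type are placeholders, not part of the change):

```cpp
#include <cudf/column/column_device_view.cuh>
#include <cudf/detail/iterator.cuh>

void some_function(cudf::column_view const& col_view)
{
  auto d_col = cudf::column_device_view::create(col_view);
  // A single instantiation now covers both the nulls and no-nulls cases;
  // nullability is resolved at runtime rather than at compile time.
  auto optional_iterator = cudf::detail::make_optional_iterator<int32_t>(
    *d_col, cudf::nullate::DYNAMIC{col_view.has_nulls()});
  // Each element is a thrust::optional<int32_t> that holds a value only
  // when the corresponding column element is valid.
}
```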
This will also close #9152 and #9580 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Nghia Truong (https://github.com/ttnghia) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9623 --- cpp/benchmarks/groupby/group_sum_benchmark.cu | 7 +- cpp/benchmarks/hashing/hash_benchmark.cpp | 37 +- .../cudf/column/column_device_view.cuh | 262 ++++---------- cpp/include/cudf/detail/iterator.cuh | 330 +++++------------- cpp/include/cudf/detail/merge.cuh | 4 +- cpp/include/cudf/table/row_operators.cuh | 173 +++++---- cpp/src/copying/copy.cu | 12 +- cpp/src/copying/segmented_shift.cu | 19 +- cpp/src/groupby/hash/groupby.cu | 49 ++- cpp/src/groupby/sort/group_nunique.cu | 4 +- cpp/src/groupby/sort/group_rank_scan.cu | 34 +- cpp/src/groupby/sort/group_scan_util.cuh | 21 +- .../sort/group_single_pass_reduction_util.cuh | 21 +- cpp/src/groupby/sort/sort_helper.cu | 34 +- cpp/src/hash/hashing.cu | 55 +-- cpp/src/hash/murmur_hash.cu | 34 +- cpp/src/join/hash_join.cu | 10 +- cpp/src/join/hash_join.cuh | 4 +- cpp/src/join/join_common_utils.cuh | 6 +- cpp/src/join/join_common_utils.hpp | 4 +- cpp/src/join/semi_join.cu | 8 +- cpp/src/partitioning/partitioning.cu | 3 +- cpp/src/reductions/arg_minmax_util.cuh | 19 +- cpp/src/reductions/scan/rank_scan.cu | 30 +- cpp/src/reductions/scan/scan_inclusive.cu | 15 +- cpp/src/reductions/simple.cuh | 16 +- cpp/src/replace/clamp.cu | 10 +- cpp/src/replace/nans.cu | 4 +- cpp/src/search/search.cu | 18 +- cpp/src/sort/is_sorted.cu | 18 +- cpp/src/sort/rank.cu | 28 +- cpp/src/sort/sort_impl.cuh | 49 +-- cpp/src/stream_compaction/distinct_count.cu | 30 +- cpp/src/stream_compaction/drop_duplicates.cu | 41 +-- cpp/src/transform/one_hot_encode.cu | 20 +- cpp/tests/iterator/optional_iterator_test.cuh | 22 +- .../optional_iterator_test_numeric.cu | 2 +- cpp/tests/table/table_view_tests.cu | 4 +- cpp/tests/utilities/column_utilities.cu | 16 +- 39 files changed, 542 insertions(+), 931 deletions(-) diff --git a/cpp/benchmarks/groupby/group_sum_benchmark.cu b/cpp/benchmarks/groupby/group_sum_benchmark.cu index f64022690d9..0e9f5061a1a 100644 --- a/cpp/benchmarks/groupby/group_sum_benchmark.cu +++ b/cpp/benchmarks/groupby/group_sum_benchmark.cu @@ -44,7 +44,6 @@ void BM_basic_sum(benchmark::State& state) { using wrapper = cudf::test::fixed_width_column_wrapper; - // const cudf::size_type num_columns{(cudf::size_type)state.range(0)}; const cudf::size_type column_size{(cudf::size_type)state.range(0)}; auto data_it = cudf::detail::make_counting_transform_iterator( @@ -53,7 +52,7 @@ void BM_basic_sum(benchmark::State& state) wrapper keys(data_it, data_it + column_size); wrapper vals(data_it, data_it + column_size); - cudf::groupby::groupby gb_obj(cudf::table_view({keys})); + cudf::groupby::groupby gb_obj(cudf::table_view({keys, keys, keys})); std::vector requests; requests.emplace_back(cudf::groupby::aggregation_request()); @@ -73,7 +72,9 @@ BENCHMARK_REGISTER_F(Groupby, Basic) ->UseManualTime() ->Unit(benchmark::kMillisecond) ->Arg(10000) - ->Arg(10000000); + ->Arg(1000000) + ->Arg(10000000) + ->Arg(100000000); void BM_pre_sorted_sum(benchmark::State& state) { diff --git a/cpp/benchmarks/hashing/hash_benchmark.cpp b/cpp/benchmarks/hashing/hash_benchmark.cpp index 77b10399693..4ccb0bfad9d 100644 --- a/cpp/benchmarks/hashing/hash_benchmark.cpp +++ b/cpp/benchmarks/hashing/hash_benchmark.cpp @@ -25,10 +25,14 @@ class HashBenchmark : public cudf::benchmark { }; -static void 
BM_hash(benchmark::State& state, cudf::hash_id hid) +enum contains_nulls { no_nulls, nulls }; + +static void BM_hash(benchmark::State& state, cudf::hash_id hid, contains_nulls has_nulls) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const data = create_random_table({cudf::type_id::INT64}, 1, row_count{n_rows}); + if (has_nulls == contains_nulls::no_nulls) + data->get_column(0).set_null_mask(rmm::device_buffer{}, 0); for (auto _ : state) { cuda_event_timer raii(state, true, rmm::cuda_stream_default); @@ -36,16 +40,25 @@ static void BM_hash(benchmark::State& state, cudf::hash_id hid) } } -#define HASH_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(HashBenchmark, name) \ - (::benchmark::State & st) { BM_hash(st, cudf::hash_id::name); } \ - BENCHMARK_REGISTER_F(HashBenchmark, name) \ - ->RangeMultiplier(4) \ - ->Ranges({{1 << 14, 1 << 24}}) \ - ->UseManualTime() \ +#define concat(a, b, c) a##b##c + +#define H_BENCHMARK_DEFINE(name, hid, n) \ + BENCHMARK_DEFINE_F(HashBenchmark, name) \ + (::benchmark::State & st) { BM_hash(st, cudf::hash_id::hid, contains_nulls::n); } \ + BENCHMARK_REGISTER_F(HashBenchmark, name) \ + ->RangeMultiplier(4) \ + ->Ranges({{1 << 14, 1 << 24}}) \ + ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -HASH_BENCHMARK_DEFINE(HASH_MURMUR3) -HASH_BENCHMARK_DEFINE(HASH_MD5) -HASH_BENCHMARK_DEFINE(HASH_SERIAL_MURMUR3) -HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3) +#define HASH_BENCHMARK_DEFINE(hid, n) H_BENCHMARK_DEFINE(concat(hid, _, n), hid, n) + +HASH_BENCHMARK_DEFINE(HASH_MURMUR3, nulls) +HASH_BENCHMARK_DEFINE(HASH_MD5, nulls) +HASH_BENCHMARK_DEFINE(HASH_SERIAL_MURMUR3, nulls) +HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, nulls) + +HASH_BENCHMARK_DEFINE(HASH_MURMUR3, no_nulls) +HASH_BENCHMARK_DEFINE(HASH_MD5, no_nulls) +HASH_BENCHMARK_DEFINE(HASH_SERIAL_MURMUR3, no_nulls) +HASH_BENCHMARK_DEFINE(HASH_SPARK_MURMUR3, no_nulls) diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 6ecb0796283..a15f20ef52d 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -44,23 +44,30 @@ namespace cudf { /** - * @brief Policy for what assumptions the optional iterator has about null values + * @brief Indicates the presence of nulls at compile-time or runtime. * - * - `YES` means that the column supports nulls and has null values, therefore - * the optional might not contain a value + * If used at compile-time, this indicator can tell the optimizer + * to include or exclude any null-checking clauses. * - * - `NO` means that the column has no null values, therefore the optional will - * always have a value - * - * - `DYNAMIC` defers the assumption of nullability to runtime with the users stating - * on construction of the iterator if column has nulls. */ -struct contains_nulls { - struct YES { +struct nullate { + struct YES : std::bool_constant { }; - struct NO { + struct NO : std::bool_constant { }; struct DYNAMIC { + DYNAMIC() = delete; + /** + * @brief Create a runtime nullate object. + * + * @see cudf::column_device_view::optional_begin for example usage + * + * @param b True if nulls are expected in the operation in which this + * object is applied. 
+ */ + constexpr explicit DYNAMIC(bool b) noexcept : value{b} {} + constexpr operator bool() const noexcept { return value; } + bool value; ///< True if nulls are expected }; }; @@ -282,7 +289,7 @@ class alignas(16) column_device_view_base { // Forward declaration template struct value_accessor; -template +template struct optional_accessor; template struct pair_accessor; @@ -493,11 +500,11 @@ class alignas(16) column_device_view : public detail::column_device_view_base { } /** - * @brief optional iterator for navigating this column + * @brief Optional iterator for navigating this column */ - template + template using const_optional_iterator = - thrust::transform_iterator, count_it>; + thrust::transform_iterator, count_it>; /** * @brief Pair iterator for navigating this column @@ -520,117 +527,57 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * * Dereferencing the returned iterator returns a `thrust::optional`. * - * When the element of an iterator contextually converted to bool, the conversion returns true + * The element of this iterator contextually converts to bool. The conversion returns true * if the object contains a value and false if it does not contain a value. * - * optional_begin with mode `DYNAMIC` defers the assumption of nullability to - * runtime, with the user stating on construction of the iterator if column has nulls. - * `DYNAMIC` mode is nice when an algorithm is going to execute on multiple - * iterators and you don't want to compile all the combinations of iterator types - * - * Example: + * Calling this method with `nullate::DYNAMIC` defers the assumption of nullability to + * runtime with the caller indicating if the column has nulls. The `nullate::DYNAMIC` is + * useful when an algorithm is going to execute on multiple iterators and all the combinations of + * iterator types are not required at compile time. * - * \code{.cpp} + * @code{.cpp} * template * void some_function(cudf::column_view const& col_view){ * auto d_col = cudf::column_device_view::create(col_view); * // Create a `DYNAMIC` optional iterator - * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::DYNAMIC{}, - * col_view.has_nulls()); - * } - * \endcode - * - * This function does not participate in overload resolution if - * `column_device_view::has_element_accessor()` is false. - * - * @throws cudf::logic_error if the column is not nullable, and `DYNAMIC` mode used and - * the user has stated nulls exist - * @throws cudf::logic_error if column datatype and Element type mismatch. - */ - template ())> - auto optional_begin(contains_nulls::DYNAMIC, bool has_nulls) const - { - return const_optional_iterator{ - count_it{0}, detail::optional_accessor{*this, has_nulls}}; - } - - /** - * @brief Return an optional iterator to the first element of the column. - * - * Dereferencing the returned iterator returns a `thrust::optional`. - * - * When the element of an iterator contextually converted to bool, the conversion returns true - * if the object contains a value and false if it does not contain a value. 
- * - * optional_begin with mode `YES` means that the column supports nulls and - * potentially has null values, therefore the optional might not contain a value - * - * Example: - * - * \code{.cpp} - * template - * void some_function(cudf::column_view const& col_view){ - * auto d_col = cudf::column_device_view::create(col_view); - * if constexpr(has_nulls) { - * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::YES{}); - * //use optional_iterator - * } else { - * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::NO{}); - * //use optional_iterator - * } + * auto optional_iterator = + * d_col->optional_begin(cudf::nullate::DYNAMIC{col_view.has_nulls()}); * } - * \endcode + * @endcode * - * This function does not participate in overload resolution if - * `column_device_view::has_element_accessor()` is false. + * Calling this method with `nullate::YES` means that the column supports nulls and + * the optional returned might not contain a value. * - * @throws cudf::logic_error if the column is not nullable, and `YES` mode used - * @throws cudf::logic_error if column datatype and Element type mismatch. - */ - template ())> - auto optional_begin(contains_nulls::YES) const - { - return const_optional_iterator{ - count_it{0}, detail::optional_accessor{*this}}; - } - - /** - * @brief Return an optional iterator to the first element of the column. + * Calling this method with `nullate::NO` means that the column has no null values + * and the optional returned will always contain a value. * - * Dereferencing the returned iterator returns a `thrust::optional`. - * - * When the element of an iterator contextually converted to bool, the conversion returns true - * if the object contains a value and false if it does not contain a value. - * - * optional_begin with mode `NO` means that the column has no null values, - * therefore the optional will always contain a value. - * - * Example: - * - * \code{.cpp} + * @code{.cpp} * template * void some_function(cudf::column_view const& col_view){ * auto d_col = cudf::column_device_view::create(col_view); * if constexpr(has_nulls) { - * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::YES{}); + * auto optional_iterator = d_col->optional_begin(cudf::nullate::YES{}); * //use optional_iterator * } else { - * auto optional_iterator = d_col->optional_begin(cudf::contains_nulls::NO{}); + * auto optional_iterator = d_col->optional_begin(cudf::nullate::NO{}); * //use optional_iterator * } * } - * \endcode + * @endcode * * This function does not participate in overload resolution if * `column_device_view::has_element_accessor()` is false. * + * @throws cudf::logic_error if the column is not nullable and `has_nulls` evaluates to true. * @throws cudf::logic_error if column datatype and Element type mismatch. */ - template ())> - auto optional_begin(contains_nulls::NO) const + template ())> + auto optional_begin(Nullate has_nulls) const { - return const_optional_iterator{ - count_it{0}, detail::optional_accessor{*this}}; + return const_optional_iterator{ + count_it{0}, detail::optional_accessor{*this, has_nulls}}; } /** @@ -695,57 +642,21 @@ class alignas(16) column_device_view : public detail::column_device_view_base { * @brief Return an optional iterator to the element following the last element of * the column. * - * Dereferencing the returned iterator returns a `thrust::optional`. + * The returned iterator represents a `thrust::optional` element. 
* * This function does not participate in overload resolution if * `column_device_view::has_element_accessor()` is false. * - * @throws cudf::logic_error if the column is not nullable, and `DYNAMIC` mode used and - * the user has stated nulls exist + * @throws cudf::logic_error if the column is not nullable and `has_nulls` is true * @throws cudf::logic_error if column datatype and Element type mismatch. */ - template ())> - auto optional_end(contains_nulls::DYNAMIC, bool has_nulls) const - { - return const_optional_iterator{ - count_it{size()}, detail::optional_accessor{*this, has_nulls}}; - } - - /** - * @brief Return an optional iterator to the element following the last element of - * the column. - * - * Dereferencing the returned iterator returns a `thrust::optional`. - * - * This function does not participate in overload resolution if - * `column_device_view::has_element_accessor()` is false. - * - * @throws cudf::logic_error if the column is not nullable, and `YES` mode used - * @throws cudf::logic_error if column datatype and Element type mismatch. - */ - template ())> - auto optional_end(contains_nulls::YES) const - { - return const_optional_iterator{ - count_it{size()}, detail::optional_accessor{*this}}; - } - - /** - * @brief Return an optional iterator to the element following the last element of - * the column. - * - * Dereferencing the returned iterator returns a `thrust::optional`. - * - * This function does not participate in overload resolution if - * `column_device_view::has_element_accessor()` is false. - * - * @throws cudf::logic_error if column datatype and Element type mismatch. - */ - template ())> - auto optional_end(contains_nulls::NO) const + template ())> + auto optional_end(Nullate has_nulls) const { - return const_optional_iterator{ - count_it{size()}, detail::optional_accessor{*this}}; + return const_optional_iterator{ + count_it{size()}, detail::optional_accessor{*this, has_nulls}}; } /** @@ -1201,77 +1112,56 @@ struct value_accessor { * @brief optional accessor of a column * * - * The optional_accessor always returns a thrust::optional of column[i]. The validity - * of the optional is determined by the contains_nulls_mode template parameter - * which has the following modes: + * The optional_accessor always returns a `thrust::optional` of `column[i]`. The validity + * of the optional is determined by the `Nullate` parameter which may be one of the following: * - * - `YES` means that the column supports nulls and has null values, therefore - * the optional might be valid or invalid + * - `nullate::YES` means that the column supports nulls and the optional returned + * might be valid or invalid. * - * - `NO` the user has attested that the column has no null values, + * - `nullate::NO` means the caller attests that the column has no null values, * no checks will occur and `thrust::optional{column[i]}` will be * return for each `i`. * - * - `DYNAMIC` defers the assumption of nullability to runtime with the users stating - * on construction of the iterator if column has nulls. - * When `with_nulls=true` the return value validity will be determined if column[i] - * is not null. - * When `with_nulls=false` the return value will always be valid + * - `nullate::DYNAMIC` defers the assumption of nullability to runtime and the caller + * specifies if the column has nulls at runtime. + * For `DYNAMIC{true}` the return value will be `thrust::optional{column[i]}` if + * element `i` is not null and `thrust::optional{}` if element `i` is null. 
+ * For `DYNAMIC{false}` the return value will always be `thrust::optional{column[i]}`. * * @throws cudf::logic_error if column datatype and template T type mismatch. - * @throws cudf::logic_error if the column is not nullable, and `with_nulls=true` - * + * @throws cudf::logic_error if the column is not nullable and `with_nulls` evaluates to true * * @tparam T The type of elements in the column - * @tparam contains_nulls_mode Specifies if nulls are checked at runtime or compile time. + * @tparam Nullate A cudf::nullate type describing how to check for nulls. */ -template +template struct optional_accessor { column_device_view const col; ///< column view of column in device /** - * @brief constructor - * @param[in] _col column device view of cudf column + * @brief Constructor + * + * @param col Column on which to iterator over its elements. + * @param with_nulls Indicates if the `col` should be checked for nulls. */ - optional_accessor(column_device_view const& _col) : col{_col} + optional_accessor(column_device_view const& _col, Nullate with_nulls) + : col{_col}, has_nulls{with_nulls} { CUDF_EXPECTS(type_id_matches_device_storage_type(col.type().id()), "the data type mismatch"); + if (with_nulls) { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } } CUDA_DEVICE_CALLABLE thrust::optional operator()(cudf::size_type i) const { - if constexpr (std::is_same_v) { + if (has_nulls) { return (col.is_valid_nocheck(i)) ? thrust::optional{col.element(i)} : thrust::optional{thrust::nullopt}; } return thrust::optional{col.element(i)}; } -}; - -template -struct optional_accessor { - column_device_view const col; ///< column view of column in device - bool has_nulls; - - /** - * @brief constructor - * @param[in] _col column device view of cudf column - * @param[in] with_nulls Indicates if @p _col has nulls - */ - optional_accessor(column_device_view const& _col, bool with_nulls) - : col{_col}, has_nulls{with_nulls} - { - CUDF_EXPECTS(type_id_matches_device_storage_type(col.type().id()), "the data type mismatch"); - if (with_nulls) { CUDF_EXPECTS(_col.nullable(), "Unexpected non-nullable column."); } - } - CUDA_DEVICE_CALLABLE - thrust::optional operator()(cudf::size_type i) const - { - return (has_nulls and col.is_null_nocheck(i)) ? thrust::optional{thrust::nullopt} - : thrust::optional{col.element(i)}; - } + Nullate has_nulls{}; }; /** diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 3e789299716..01742384972 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -171,127 +171,61 @@ auto make_null_replacement_iterator(column_device_view const& column, * * Dereferencing the returned iterator returns a `thrust::optional`. * - * When the element of an iterator contextually converted to bool, the conversion returns true + * The element of this iterator contextually converts to bool. The conversion returns true * if the object contains a value and false if it does not contain a value. * - * make_optional_iterator with mode `DYNAMIC` defers the assumption of nullability to - * runtime, with the user stating on construction of the iterator if column has nulls. - * `DYNAMIC` mode is nice when an algorithm is going to execute on multiple - * iterators and you don't want to compile all the combinations of iterator types + * Calling this function with `nullate::DYNAMIC` defers the assumption + * of nullability to runtime with the caller indicating if the column has nulls. 
+ * This is useful when an algorithm is going to execute on multiple iterators and all
+ * the combinations of iterator types are not required at compile time.
 *
- * Example:
- *
- * \code{.cpp}
+ * @code{.cpp}
 * template <typename T>
 * void some_function(cudf::column_view const& col_view){
 *    auto d_col = cudf::column_device_view::create(col_view);
 *    // Create a `DYNAMIC` optional iterator
-*    auto optional_iterator = cudf::detail::make_optional_iterator<T>(d_col,
-*                                                                     cudf::contains_nulls::DYNAMIC{},
-*                                                                     col_view.has_nulls());
+*    auto optional_iterator =
+*      cudf::detail::make_optional_iterator<T>(
+*        d_col, cudf::nullate::DYNAMIC{col_view.has_nulls()});
 * }
-* \endcode
-*
-* @throws cudf::logic_error if the column is not nullable, and `DYNAMIC` mode used and
-*         the user has stated nulls exist
-* @throws cudf::logic_error if column datatype and Element type mismatch.
-*
-* @tparam Element The type of elements in the column
-* @param column The column to iterate
-* @return Iterator that returns valid column elements and the validity of the
-*         element in a thrust::optional
-*/
-template <typename Element>
-auto make_optional_iterator(column_device_view const& column,
-                            contains_nulls::DYNAMIC,
-                            bool has_nulls)
-{
-  return column.optional_begin<Element>(contains_nulls::DYNAMIC{}, has_nulls);
-}
-
-/**
- * @brief Constructs an optional iterator over a column's values and its validity.
- *
- * Dereferencing the returned iterator returns a `thrust::optional<Element>`.
- *
- * When the element of an iterator contextually converted to bool, the conversion returns true
- * if the object contains a value and false if it does not contain a value.
- *
- * make_optional_iterator with mode `YES` means that the column supports nulls and
- * potentially has null values, therefore the optional might not contain a value
+ * @endcode
 *
- * Example:
+ * Calling this function with `nullate::YES` means that the column supports
+ * nulls and the optional returned might not contain a value.
+ * Calling this function with `nullate::NO` means that the column has no
+ * null values and the optional returned will always contain a value.
 *
- * \code{.cpp}
+ * @code{.cpp}
 * template <typename T, bool has_nulls>
 * void some_function(cudf::column_view const& col_view){
 *    auto d_col = cudf::column_device_view::create(col_view);
 *    if constexpr(has_nulls) {
-*      auto optional_iterator = cudf::detail::make_optional_iterator<T>(d_col,
-*                                                                       cudf::contains_nulls::YES{});
+*      auto optional_iterator =
+*        cudf::detail::make_optional_iterator<T>(d_col, cudf::nullate::YES{});
 *      //use optional_iterator
 *    } else {
-*      auto optional_iterator = cudf::detail::make_optional_iterator<T>(d_col,
-*                                                                       cudf::contains_nulls::NO{});
+*      auto optional_iterator =
+*        cudf::detail::make_optional_iterator<T>(d_col, cudf::nullate::NO{});
 *      //use optional_iterator
 *    }
 * }
-* \endcode
+ * @endcode
 *
- * @throws cudf::logic_error if the column is not nullable, and `YES` mode used
+ * @throws cudf::logic_error if the column is not nullable and `has_nulls` is true.
 * @throws cudf::logic_error if column datatype and Element type mismatch.
 *
- * @tparam Element The type of elements in the column
- * @param column The column to iterate
- * @return Iterator that returns column elements and the validity of the
- *         element as a thrust::optional
- */
-template <typename Element>
-auto make_optional_iterator(column_device_view const& column, contains_nulls::YES)
-{
-  return column.optional_begin<Element>(contains_nulls::YES{});
-}
-
-/**
- * @brief Constructs an optional iterator over a column's values and its validity.
- *
- * Dereferencing the returned iterator returns a `thrust::optional<Element>`.
- *
- * When the element of an iterator contextually converted to bool, the conversion returns true
- * if the object contains a value and false if it does not contain a value.
- *
- * make_optional_iterator with mode `NO` means that the column has no null values,
- * therefore the optional will always contain a value.
- *
- * Example:
- *
- * \code{.cpp}
- * template <typename T, bool has_nulls>
- * void some_function(cudf::column_view const& col_view){
- *    auto d_col = cudf::column_device_view::create(col_view);
- *    if constexpr(has_nulls) {
- *      auto optional_iterator = cudf::detail::make_optional_iterator<T>(d_col,
- *                                                                       cudf::contains_nulls::YES{});
- *      //use optional_iterator
- *    } else {
- *      auto optional_iterator = cudf::detail::make_optional_iterator<T>(d_col,
- *                                                                       cudf::contains_nulls::NO{});
- *      //use optional_iterator
- *    }
- * }
- * \endcode
- *
- * @throws cudf::logic_error if column datatype and Element type mismatch.
+ * @tparam Element The type of elements in the column.
+ * @tparam Nullate A cudf::nullate type describing how to check for nulls.
 *
- * @tparam Element The type of elements in the column
 * @param column The column to iterate
- * @return Iterator that returns column elements and the validity of the
- *         element in a thrust::optional
+ * @param has_nulls Indicates whether `column` is checked for nulls.
+ * @return Iterator that returns valid column elements and the validity of the
+ *         element in a `thrust::optional`
 */
-template <typename Element>
-auto make_optional_iterator(column_device_view const& column, contains_nulls::NO)
+template <typename Element, typename Nullate>
+auto make_optional_iterator(column_device_view const& column, Nullate has_nulls)
 {
-  return column.optional_begin<Element>(contains_nulls::NO{});
+  return column.optional_begin<Element>(has_nulls);
 }

 /**
@@ -447,40 +381,38 @@ auto inline make_scalar_iterator(scalar const& scalar_value)
                                          scalar_value_accessor<Element>{scalar_value});
 }

-template <typename Element, typename contains_nulls_mode>
-struct scalar_optional_accessor;
-
 /**
- * @brief optional accessor of a maybe-nullable scalar
- *
- * The scalar_optional_accessor always returns a thrust::optional of the scalar.
- * The validity of the optional is determined by the contains_nulls_mode template parameter
- * which has the following modes:
+ * @brief Optional accessor for a scalar
 *
- * `DYNAMIC`: Defer nullability checks to runtime
+ * The `scalar_optional_accessor` always returns a `thrust::optional` of the scalar.
+ * The validity of the optional is determined by the `Nullate` parameter which may
+ * be one of the following:
 *
- *  - When `with_nulls=true` the return value will be a `thrust::optional{scalar}`
- *    when scalar is valid, and `thrust::optional{}` when the scalar is invalid.
+ * - `nullate::YES` means that the scalar may be valid or invalid and the optional returned
+ *   will contain a value only if the scalar is valid.
 *
- *  - When `with_nulls=false` the return value will always be `thrust::optional{scalar}`
+ * - `nullate::NO` means the caller attests that the scalar will always be valid,
+ *   no checks will occur and `thrust::optional{scalar}` will be returned
+ *   for each `i`.
 *
- * `NO`: No null values will occur for this scalar, no checks will occur
- *   and `thrust::optional{scalar}` will always be returned.
- *
- * `YES`: null values will occur for this scalar,
- *   and `thrust::optional{scalar}` will always be returned.
+ * - `nullate::DYNAMIC` defers the assumption of nullability to runtime and the caller
+ *   specifies if the scalar may be valid or invalid.
+ *   For `DYNAMIC{true}` the return value will be a `thrust::optional{scalar}` when the
+ *   scalar is valid and a `thrust::optional{}` when the scalar is invalid.
+ *   For `DYNAMIC{false}` the return value will always be a `thrust::optional{scalar}`.
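+ *
+ * A minimal construction sketch (illustrative; assumes a numeric scalar):
+ *
+ * @code{.cpp}
+ * cudf::numeric_scalar<int32_t> s{42};
+ * auto acc = scalar_optional_accessor<int32_t, cudf::nullate::DYNAMIC>{
+ *   s, cudf::nullate::DYNAMIC{s.is_valid()}};
+ * // In device code, acc(i) yields thrust::optional<int32_t>{42} for any `i`
+ * // while `s` remains alive.
+ * @endcode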
 *
 * @throws `cudf::logic_error` if scalar datatype and Element type mismatch.
 *
 * @tparam Element The return type of the functor
+ * @tparam Nullate A cudf::nullate type describing how to check for nulls.
 */
-template <typename Element, typename contains_nulls_mode>
+template <typename Element, typename Nullate>
 struct scalar_optional_accessor : public scalar_value_accessor<Element> {
   using super_t    = scalar_value_accessor<Element>;
   using value_type = thrust::optional<Element>;

-  scalar_optional_accessor(scalar const& scalar_value)
-    : scalar_value_accessor<Element>(scalar_value)
+  scalar_optional_accessor(scalar const& scalar_value, Nullate with_nulls)
+    : scalar_value_accessor<Element>(scalar_value), has_nulls{with_nulls}
   {
   }

@@ -494,32 +426,14 @@ struct scalar_optional_accessor : public scalar_value_accessor<Element> {
   CUDA_HOST_DEVICE_CALLABLE
   const value_type operator()(size_type) const
   {
-    if constexpr (std::is_same_v<contains_nulls_mode, contains_nulls::YES>) {
+    if (has_nulls) {
       return (super_t::dscalar.is_valid()) ? Element{super_t::dscalar.value()}
                                            : value_type{thrust::nullopt};
     }
     return Element{super_t::dscalar.value()};
   }
-};

-template <typename Element>
-struct scalar_optional_accessor<Element, contains_nulls::DYNAMIC>
-  : public scalar_value_accessor<Element> {
-  using super_t    = scalar_value_accessor<Element>;
-  using value_type = thrust::optional<Element>;
-  bool has_nulls;
-
-  scalar_optional_accessor(scalar const& scalar_value, bool with_nulls)
-    : scalar_value_accessor<Element>(scalar_value), has_nulls{with_nulls}
-  {
-  }
-
-  CUDA_HOST_DEVICE_CALLABLE
-  const value_type operator()(size_type) const
-  {
-    return (has_nulls and !super_t::dscalar.is_valid()) ? value_type{thrust::nullopt}
-                                                        : Element{super_t::dscalar.value()};
-  }
+
+  Nullate has_nulls{};
 };

 /**
@@ -622,156 +536,70 @@ struct scalar_representation_pair_accessor : public scalar_value_accessor<Element> {
 * Dereferencing the returned iterator returns a `thrust::optional<Element>`.
 *
- * When the element of an iterator contextually converted to bool, the conversion returns true
+ * The element of this iterator contextually converts to bool. The conversion returns true
 * if the object contains a value and false if it does not contain a value.
 *
 * The iterator behavior is undefined if the scalar is destroyed before iterator dereferencing.
 *
- * make_optional_iterator with mode `DYNAMIC` defers the assumption of nullability to
- * runtime, with the user stating on construction of the iterator if scalar has nulls.
- *
- * Example:
+ * Calling this function with `nullate::DYNAMIC` defers the assumption
+ * of nullability to runtime with the caller indicating if the scalar is valid.
 *
- * \code{.cpp}
+ * @code{.cpp}
 * template <typename T>
 * void some_function(cudf::column_view const& col_view,
 *                    scalar const& scalar_value,
 *                    bool col_has_nulls){
 *    auto d_col = cudf::column_device_view::create(col_view);
-*    auto column_iterator = cudf::detail::make_optional_iterator<T>(d_col,
-                               cudf::contains_nulls::DYNAMIC{}, col_has_nulls);
-*    auto scalar_iterator = cudf::detail::make_optional_iterator<T>(scalar_value,
-                               cudf::contains_nulls::DYNAMIC{}, scalar_value.is_valid());
+*    auto column_iterator = cudf::detail::make_optional_iterator<T>(
+*      d_col, cudf::nullate::DYNAMIC{col_has_nulls});
+*    auto scalar_iterator = cudf::detail::make_optional_iterator<T>(
+*      scalar_value, cudf::nullate::DYNAMIC{scalar_value.is_valid()});
 *    //use iterators
 * }
-* \endcode
-*
-* @throws cudf::logic_error if the scalar is not nullable, and `DYNAMIC` mode used and
-*         the user has stated nulls exist
-* @throws cudf::logic_error if scalar datatype and Element type mismatch.
-*
-* @tparam Element The type of elements in the scalar
-* @tparam has_nulls If the scalar value will have a null at runtime
-* @param scalar_value The scalar to iterate
-* @return Iterator that returns scalar elements and validity of the
-*         element in a thrust::optional
-*/
-template <typename Element>
-auto inline make_optional_iterator(scalar const& scalar_value,
-                                   contains_nulls::DYNAMIC,
-                                   bool has_nulls)
-{
-  CUDF_EXPECTS(type_id_matches_device_storage_type<Element>(scalar_value.type().id()),
-               "the data type mismatch");
-  return thrust::make_transform_iterator(
-    thrust::make_constant_iterator<size_type>(0),
-    scalar_optional_accessor<Element, contains_nulls::DYNAMIC>{scalar_value, has_nulls});
-}
-
-/**
- * @brief Constructs an optional iterator over a scalar's values and its validity.
- *
- * Dereferencing the returned iterator returns a `thrust::optional<Element>`.
- *
- * When the element of an iterator contextually converted to bool, the conversion returns true
- * if the object contains a value and false if it does not contain a value.
- *
- * The iterator behavior is undefined if the scalar is destroyed before iterator dereferencing.
- *
- * make_optional_iterator ith mode `YES` means that the scalar supports nulls and
- * potentially has null values, therefore the optional might not contain a value
- * therefore the optional will always contain a value.
+ * @endcode
 *
- * Example:
+ * Calling this function with `nullate::YES` means that the scalar may be invalid
+ * and the optional returned might not contain a value.
+ * Calling this function with `nullate::NO` means that the scalar is valid
+ * and the optional returned will always contain a value.
* - * \code{.cpp} + * @code{.cpp} * template * void some_function(cudf::column_view const& col_view, scalar const& scalar_value){ * auto d_col = cudf::column_device_view::create(col_view); * if constexpr(any_nulls) { - * auto column_iterator = cudf::detail::make_optional_iterator(d_col, - * cudf::contains_nulls::YES{}); - * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value, - * cudf::contains_nulls::YES{}); + * auto column_iterator = + * cudf::detail::make_optional_iterator(d_col, cudf::nullate::YES{}); + * auto scalar_iterator = + * cudf::detail::make_optional_iterator(scalar_value, cudf::nullate::YES{}); * //use iterators * } else { - * auto column_iterator = cudf::detail::make_optional_iterator(d_col, - * cudf::contains_nulls::NO{}); - * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value, - * cudf::contains_nulls::NO{}); + * auto column_iterator = + * cudf::detail::make_optional_iterator(d_col, cudf::nullate::NO{}); + * auto scalar_iterator = + * cudf::detail::make_optional_iterator(scalar_value, cudf::nullate::NO{}); * //use iterators * } * } - * \endcode + * @endcode * - * @throws cudf::logic_error if the scalar is not nullable, and `YES` mode used * @throws cudf::logic_error if scalar datatype and Element type mismatch. * * @tparam Element The type of elements in the scalar - * @param scalar_value The scalar to iterate - * @return Iterator that returns scalar elements and the validity of the - * element in a thrust::optional - */ -template -auto inline make_optional_iterator(scalar const& scalar_value, contains_nulls::YES) -{ - CUDF_EXPECTS(type_id_matches_device_storage_type(scalar_value.type().id()), - "the data type mismatch"); - return thrust::make_transform_iterator( - thrust::make_constant_iterator(0), - scalar_optional_accessor{scalar_value}); -} - -/** - * @brief Constructs an optional iterator over a scalar's values and its validity. - * - * Dereferencing the returned iterator returns a `thrust::optional`. - * - * When the element of an iterator contextually converted to bool, the conversion returns true - * if the object contains a value and false if it does not contain a value. - * - * The iterator behavior is undefined if the scalar is destroyed before iterator dereferencing. - * - * make_optional_iterator with mode `NO` means that the scalar has no null values, - * therefore the optional will always contain a value. + * @tparam Nullate A cudf::nullate type describing how to check for nulls. * - * Example: - * - * \code{.cpp} - * template - * void some_function(cudf::column_view const& col_view, scalar const& scalar_value){ - * auto d_col = cudf::column_device_view::create(col_view); - * if constexpr(any_nulls) { - * auto column_iterator = cudf::detail::make_optional_iterator(d_col, - * cudf::contains_nulls::YES{}); - * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value, - * cudf::contains_nulls::YES{}); - * //use iterators - * } else { - * auto column_iterator = cudf::detail::make_optional_iterator(d_col, - * cudf::contains_nulls::NO{}); - * auto scalar_iterator = cudf::detail::make_optional_iterator(scalar_value, - * cudf::contains_nulls::NO{}); - * //use iterators - * } - * } - * \endcode - * - * @throws cudf::logic_error if scalar datatype and Element type mismatch. 
- *
- * @tparam Element The type of elements in the scalar
- * @param scalar_value The scalar to iterate
- * @return Iterator that returns scalar elements and the validity of the
- *         element in a thrust::optional
+ * @param scalar_value The scalar to be returned by the iterator.
+ * @param has_nulls Indicates if the scalar value may be invalid.
+ * @return Iterator that returns the scalar and the validity of the scalar in a thrust::optional
 */
-template <typename Element>
-auto inline make_optional_iterator(scalar const& scalar_value, contains_nulls::NO)
+template <typename Element, typename Nullate>
+auto inline make_optional_iterator(scalar const& scalar_value, Nullate has_nulls)
 {
   CUDF_EXPECTS(type_id_matches_device_storage_type<Element>(scalar_value.type().id()),
                "the data type mismatch");
   return thrust::make_transform_iterator(
     thrust::make_constant_iterator<size_type>(0),
-    scalar_optional_accessor<Element, contains_nulls::NO>{scalar_value});
+    scalar_optional_accessor<Element, Nullate>{scalar_value, has_nulls});
 }

 /**
diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh
index ec83e348e33..f141d9b5d59 100644
--- a/cpp/include/cudf/detail/merge.cuh
+++ b/cpp/include/cudf/detail/merge.cuh
@@ -90,8 +90,8 @@ struct tagged_element_relational_comparator {
       column_device_view const* ptr_right_dview{r_side == side::LEFT ? &lhs : &rhs};

-      auto erl_comparator =
-        element_relational_comparator<has_nulls>(*ptr_left_dview, *ptr_right_dview, null_precedence);
+      auto erl_comparator = element_relational_comparator(
+        nullate::DYNAMIC{has_nulls}, *ptr_left_dview, *ptr_right_dview, null_precedence);

       return cudf::type_dispatcher(lhs.type(), erl_comparator, l_indx, r_indx);
     }
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index 70ccac2f75d..0f3ca073380 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -50,9 +50,9 @@ namespace detail {
 /**
 * @brief Compare the elements ordering with respect to `lhs`.
 *
- * @param[in] lhs first element
- * @param[in] rhs second element
- * @return weak_ordering Indicates the relationship between the elements in
+ * @param lhs first element
+ * @param rhs second element
+ * @return Indicates the relationship between the elements in
 *         the `lhs` and `rhs` columns.
 */
 template <typename Element>
@@ -69,14 +69,15 @@ __device__ weak_ordering compare_elements(Element lhs, Element rhs)
 /**
 * @brief A specialization for floating-point `Element` type relational comparison
- * to derive the order of the elements with respect to `lhs`. Specialization is to
- * handle `nan` in the order shown below.
+ * to derive the order of the elements with respect to `lhs`.
+ *
+ * This specialization handles `nan` in the following order:
 * `[-Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN, null] (for null_order::AFTER)`
 * `[null, -Inf, -ve, 0, -0, +ve, +Inf, NaN, NaN] (for null_order::BEFORE)`
 *
- * @param[in] lhs first element
- * @param[in] rhs second element
- * @return weak_ordering Indicates the relationship between the elements in
+ * @param lhs first element
+ * @param rhs second element
+ * @return Indicates the relationship between the elements in
 *         the `lhs` and `rhs` columns.
 */
 template <typename Element, std::enable_if_t<std::is_floating_point<Element>::value>* = nullptr>
@@ -119,7 +120,7 @@ inline __device__ auto null_compare(bool lhs_is_null, bool rhs_is_null, null_ord
 *
 * @param[in] lhs first element
 * @param[in] rhs second element
- * @return weak_ordering Indicates the relationship between the elements in
+ * @return Indicates the relationship between the elements in
 *         the `lhs` and `rhs` columns.
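 *
 * For instance (an illustrative sketch of the expected results for integers):
 *
 * @code{.cpp}
 * relational_compare(1, 2);  // weak_ordering::LESS
 * relational_compare(2, 2);  // weak_ordering::EQUIVALENT
 * relational_compare(3, 2);  // weak_ordering::GREATER
 * @endcode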
 */
 template <typename Element, std::enable_if_t<not std::is_floating_point<Element>::value>* = nullptr>
@@ -132,9 +133,9 @@ __device__ weak_ordering relational_compare(Element lhs, Element rhs)
 /**
 * @brief A specialization for floating-point `Element` type to check if
 * `lhs` is equivalent to `rhs`. `nan == nan`.
 *
- * @param[in] lhs first element
- * @param[in] rhs second element
- * @return bool `true` if `lhs` == `rhs` else `false`.
+ * @param lhs first element
+ * @param rhs second element
+ * @return `true` if `lhs` == `rhs` else `false`.
 */
 template <typename Element, std::enable_if_t<std::is_floating_point<Element>::value>* = nullptr>
 __device__ bool equality_compare(Element lhs, Element rhs)
@@ -147,9 +148,9 @@ __device__ bool equality_compare(Element lhs, Element rhs)
 * @brief A specialization for non-floating-point `Element` type to check if
 * `lhs` is equivalent to `rhs`.
 *
- * @param[in] lhs first element
- * @param[in] rhs second element
- * @return bool `true` if `lhs` == `rhs` else `false`.
+ * @param lhs first element
+ * @param rhs second element
+ * @return `true` if `lhs` == `rhs` else `false`.
 */
 template <typename Element, std::enable_if_t<not std::is_floating_point<Element>::value>* = nullptr>
 __device__ bool equality_compare(Element const lhs, Element const rhs)
@@ -160,9 +161,9 @@ __device__ bool equality_compare(Element const lhs, Element const rhs)
 /**
 * @brief Performs an equality comparison between two elements in two columns.
 *
- * @tparam has_nulls Indicates the potential for null values in either column.
+ * @tparam Nullate A cudf::nullate type describing how to check for nulls.
 */
-template <bool has_nulls = true>
+template <typename Nullate>
 class element_equality_comparator {
 public:
  /**
   * @brief Construct type-dispatched function object for comparing equality
   * between two elements.
   *
   * @note `lhs` and `rhs` may be the same.
   *
+   * @param has_nulls Indicates if either input column contains nulls.
   * @param lhs The column containing the first element
   * @param rhs The column containing the second element (may be the same as lhs)
   * @param nulls_are_equal Indicates if two null elements are treated as equivalent
   */
-  __host__ __device__ element_equality_comparator(column_device_view lhs,
-                                                  column_device_view rhs,
-                                                  bool nulls_are_equal = true)
-    : lhs{lhs}, rhs{rhs}, nulls_are_equal{nulls_are_equal}
+  __host__ __device__
+  element_equality_comparator(Nullate has_nulls,
+                              column_device_view lhs,
+                              column_device_view rhs,
+                              null_equality nulls_are_equal = null_equality::EQUAL)
+    : lhs{lhs}, rhs{rhs}, nulls{has_nulls}, nulls_are_equal{nulls_are_equal}
  {
  }

  /**
   * @brief Compares the specified elements for equality.
   *
   * @param lhs_element_index The index of the first element
   * @param rhs_element_index The index of the second element
   * @return True if the `lhs` and `rhs` elements are both null and `nulls_are_equal` is true,
   *         or if they compare equal
-   *
   */
  template <typename Element,
            std::enable_if_t<cudf::is_equality_comparable<Element, Element>()>* = nullptr>
  __device__ bool operator()(size_type lhs_element_index,
                             size_type rhs_element_index) const noexcept
  {
-    if (has_nulls) {
+    if (nulls) {
      bool const lhs_is_null{lhs.is_null(lhs_element_index)};
      bool const rhs_is_null{rhs.is_null(rhs_element_index)};
      if (lhs_is_null and rhs_is_null) {
-        return nulls_are_equal;
+        return nulls_are_equal == null_equality::EQUAL;
      } else if (lhs_is_null != rhs_is_null) {
        return false;
      }
    }

    return equality_compare(lhs.element<Element>(lhs_element_index),
                            rhs.element<Element>(rhs_element_index));
  }

@@ -220,14 +223,18 @@ class element_equality_comparator {
 private:
  column_device_view lhs;
  column_device_view rhs;
-  bool nulls_are_equal;
+  Nullate nulls;
+  null_equality nulls_are_equal;
};
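+
+// A usage sketch (illustrative, with hypothetical names): in device code, one
+// element from each of two column_device_views `d_lhs` and `d_rhs` can be
+// compared with a runtime null check via the Nullate-based comparator:
+//
+//   auto comp = element_equality_comparator{cudf::nullate::DYNAMIC{true}, d_lhs, d_rhs};
+//   bool eq   = cudf::type_dispatcher(d_lhs.type(), comp, lhs_row, rhs_row);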
-template <bool has_nulls = true>
+template <typename Nullate>
 class row_equality_comparator {
 public:
-  row_equality_comparator(table_device_view lhs, table_device_view rhs, bool nulls_are_equal = true)
-    : lhs{lhs}, rhs{rhs}, nulls_are_equal{nulls_are_equal}
+  row_equality_comparator(Nullate has_nulls,
+                          table_device_view lhs,
+                          table_device_view rhs,
+                          null_equality nulls_are_equal = null_equality::EQUAL)
+    : lhs{lhs}, rhs{rhs}, nulls{has_nulls}, nulls_are_equal{nulls_are_equal}
  {
    CUDF_EXPECTS(lhs.num_columns() == rhs.num_columns(), "Mismatched number of columns.");
  }

@@ -236,7 +243,7 @@ class row_equality_comparator {
  {
    auto equal_elements = [=](column_device_view l, column_device_view r) {
      return cudf::type_dispatcher(l.type(),
-                                   element_equality_comparator<has_nulls>{l, r, nulls_are_equal},
+                                   element_equality_comparator{nulls, l, r, nulls_are_equal},
                                    lhs_row_index,
                                    rhs_row_index);
    };

@@ -247,15 +254,16 @@ class row_equality_comparator {
 private:
  table_device_view lhs;
  table_device_view rhs;
-  bool nulls_are_equal;
+  Nullate nulls;
+  null_equality nulls_are_equal;
};

/**
 * @brief Performs a relational comparison between two elements in two columns.
 *
- * @tparam has_nulls Indicates the potential for null values in either column.
+ * @tparam Nullate A cudf::nullate type describing how to check for nulls.
 */
-template <bool has_nulls = true>
+template <typename Nullate>
 class element_relational_comparator {
 public:
  /**
   * @brief Construct type-dispatched function object for performing a
   * relational comparison between two elements.
   *
   * @note `lhs` and `rhs` may be the same.
   *
+   * @param has_nulls Indicates if either input column contains nulls.
   * @param lhs The column containing the first element
   * @param rhs The column containing the second element (may be the same as lhs)
-   * @param null_precedence Indicates how null values are ordered with other
-   *        values
+   * @param null_precedence Indicates how null values are ordered with other values
   */
-  __host__ __device__ element_relational_comparator(column_device_view lhs,
+  __host__ __device__ element_relational_comparator(Nullate has_nulls,
+                                                    column_device_view lhs,
                                                    column_device_view rhs,
                                                    null_order null_precedence)
-    : lhs{lhs}, rhs{rhs}, null_precedence{null_precedence}
+    : lhs{lhs}, rhs{rhs}, nulls{has_nulls}, null_precedence{null_precedence}
+  {
+  }
+
+  __host__ __device__ element_relational_comparator(Nullate has_nulls,
+                                                    column_device_view lhs,
+                                                    column_device_view rhs)
+    : lhs{lhs}, rhs{rhs}, nulls{has_nulls}
  {
  }

  /**
   * @brief Performs a relational comparison between the specified elements
   *
   * @param lhs_element_index The index of the first element
   * @param rhs_element_index The index of the second element
-   * @return weak_ordering Indicates the relationship between the elements in
+   * @return Indicates the relationship between the elements in
   *         the `lhs` and `rhs` columns.
   */
-template <bool has_nulls = true>
+template <typename Nullate>
 class row_lexicographic_comparator {
 public:
  /**
   * @brief Construct a function object for performing a lexicographic
   * comparison between the rows of two tables.
   *
   * @param lhs The first table
   * @param rhs The second table (may be the same table as `lhs`)
+   * @param has_nulls Indicates if either input table contains columns with nulls.
   * @param column_order Optional, device array the same length as a row that
   * indicates the desired ascending/descending order of each column in a row.
   * If `nullptr`, it is assumed all columns are sorted in ascending order.
   * @param null_precedence Optional, device array the same length as a row and indicates how
   * null values compare to all other values for every column. If it is nullptr, then null
   * precedence would be `null_order::BEFORE` for all columns.
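+   *
+   * A construction sketch (illustrative; `lhs` and `rhs` here are assumed to be
+   * relationally comparable cudf::table_views):
+   *
+   * @code{.cpp}
+   * auto d_lhs = cudf::table_device_view::create(lhs);
+   * auto d_rhs = cudf::table_device_view::create(rhs);
+   * row_lexicographic_comparator comp{
+   *   cudf::nullate::DYNAMIC{cudf::has_nulls(lhs) or cudf::has_nulls(rhs)}, *d_lhs, *d_rhs};
+   * // comp(i, j) is true when row i of lhs sorts lexicographically before row j of rhs
+   * @endcode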
   */
-  row_lexicographic_comparator(table_device_view lhs,
+  row_lexicographic_comparator(Nullate has_nulls,
+                               table_device_view lhs,
                               table_device_view rhs,
                               order const* column_order         = nullptr,
                               null_order const* null_precedence = nullptr)
-    : _lhs{lhs}, _rhs{rhs}, _column_order{column_order}, _null_precedence{null_precedence}
+    : _lhs{lhs},
+      _rhs{rhs},
+      _nulls{has_nulls},
+      _column_order{column_order},
+      _null_precedence{null_precedence}
  {
    CUDF_EXPECTS(_lhs.num_columns() == _rhs.num_columns(), "Mismatched number of columns.");
    CUDF_EXPECTS(detail::is_relationally_comparable(_lhs, _rhs),
@@ -376,14 +399,14 @@ class row_lexicographic_comparator {
    for (size_type i = 0; i < _lhs.num_columns(); ++i) {
      bool ascending = (_column_order == nullptr) or (_column_order[i] == order::ASCENDING);

-      weak_ordering state{weak_ordering::EQUIVALENT};
      null_order null_precedence =
        _null_precedence == nullptr ? null_order::BEFORE : _null_precedence[i];

      auto comparator =
-        element_relational_comparator<has_nulls>{_lhs.column(i), _rhs.column(i), null_precedence};
+        element_relational_comparator{_nulls, _lhs.column(i), _rhs.column(i), null_precedence};

-      state = cudf::type_dispatcher(_lhs.column(i).type(), comparator, lhs_index, rhs_index);
+      weak_ordering state =
+        cudf::type_dispatcher(_lhs.column(i).type(), comparator, lhs_index, rhs_index);

      if (state == weak_ordering::EQUIVALENT) { continue; }

@@ -395,6 +418,7 @@ class row_lexicographic_comparator {
 private:
  table_device_view _lhs;
  table_device_view _rhs;
+  Nullate _nulls{};
  null_order const* _null_precedence{};
  order const* _column_order{};
};  // class row_lexicographic_comparator

@@ -403,9 +427,9 @@ class row_lexicographic_comparator {
 /**
 * @brief Computes the hash value of an element in the given column.
 *
 * @tparam hash_function Hash functor to use for hashing elements.
- * @tparam has_nulls Indicates the potential for null values in the column.
+ * @tparam Nullate A cudf::nullate type describing how to check for nulls.
 */
-template