Skip to content

Commit

Permalink
Gracefully CUDF_FAIL when skip_rows > 0 in Chunked Parquet reader (#…
Browse files Browse the repository at this point in the history
…16385)

This PR must merge in cudf 24.08 to avoid unhandled expections.

Gracefully CUDF_FAIL in chunked parquet reader when `skip_rows>0` which may result in runtime exceptions like segfaults or an infinite loop. See #16186 for more information.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

URL: #16385
  • Loading branch information
mhaseeb123 authored Jul 24, 2024
1 parent 8bba6df commit 59f6584
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 10 deletions.
5 changes: 5 additions & 0 deletions cpp/src/io/parquet/reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ chunked_reader::chunked_reader(std::size_t chunk_read_limit,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
// TODO: skip_rows not currently supported in chunked parquet reader until
// https://github.com/rapidsai/cudf/issues/16186 is closed
CUDF_EXPECTS(options.get_skip_rows() == 0,
"skip_rows > 0 is not currently supported in the Chunked Parquet reader.");

_impl = std::make_unique<impl>(
chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr);
}
Expand Down
29 changes: 19 additions & 10 deletions cpp/tests/io/parquet_chunked_reader_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1544,7 +1544,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource)

// Chunked-read rows_to_read rows skipping rows_to_skip from single data source
{
auto const rows_to_skip = 1'237;
// TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
auto const rows_to_skip = 0; // 1'237
auto const rows_to_read = 7'232;
auto constexpr output_read_limit = 1'500;
auto constexpr pass_read_limit = 3'500;
Expand All @@ -1571,7 +1572,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource)

// Chunked-read two data sources skipping the first entire file completely
{
auto constexpr rows_to_skip = 15'723;
// TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
auto constexpr rows_to_skip = 0; // 15'723;
auto constexpr output_read_limit = 1'024'000;
auto constexpr pass_read_limit = 1'024'000;

Expand All @@ -1588,20 +1590,25 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource)

auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);

// TODO: Enable code inside /* */ when https://github.com/rapidsai/cudf/issues/16186 is resolved
auto int64_col_selected =
int64s_col(int64_data.begin() + rows_to_skip - num_rows, int64_data.end()).release();
int64s_col(int64_data.begin() /* + rows_to_skip - num_rows */, int64_data.end()).release();

cudf::table_view const expected_selected({int64_col_selected->view()});

CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view());
// TODO: Enable the following check when https://github.com/rapidsai/cudf/issues/16186
// is resolved
// CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view());

EXPECT_EQ(num_rows_per_source.size(), 2);
EXPECT_EQ(num_rows_per_source[0], 0);
EXPECT_EQ(num_rows_per_source[1], nsources * num_rows - rows_to_skip);
EXPECT_EQ(num_rows_per_source[0], num_rows /* 0 */);
EXPECT_EQ(num_rows_per_source[1], num_rows /* nsources * num_rows - rows_to_skip */);
}

// Chunked-read from single data source skipping rows_to_skip
{
auto const rows_to_skip = 1'237;
// TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
auto const rows_to_skip = 0; // 1'237;
auto constexpr output_read_limit = 1'500;
auto constexpr pass_read_limit = 1'800;

Expand Down Expand Up @@ -1736,7 +1743,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceMultipleSources)

// Chunked-read rows_to_read rows skipping rows_to_skip from eight data sources
{
auto const rows_to_skip = 25'571;
// TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
auto const rows_to_skip = 0; // 25'571;
auto const rows_to_read = 41'232;
auto constexpr output_read_limit = 15'000;
auto constexpr pass_read_limit = 35'000;
Expand Down Expand Up @@ -1782,8 +1790,9 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceMultipleSources)

// Chunked-read four data sources skipping three files completely
{
auto const nsources = 4;
int constexpr rows_to_skip = num_rows * 3 + 1;
auto const nsources = 4;
// TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
int constexpr rows_to_skip = 0; // num_rows * 3 + 1;
auto constexpr output_read_limit = 15'000;
auto constexpr pass_read_limit = 35'000;
std::vector<int64_t> int64_selected_data{};
Expand Down

0 comments on commit 59f6584

Please sign in to comment.