From f84cd4316eaa61e231b5fd096608ca09d5e3c08c Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:26:43 -0500 Subject: [PATCH 1/3] [BUG] xfail Polars excel test (#17731) One the Polars tests fails when `fastexcel>=0.12.1`. I opened https://github.com/pola-rs/polars/issues/20698 to track that failing test. This PR xfail that test for now. xref #17677 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17731 --- python/cudf_polars/cudf_polars/testing/plugin.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index c16df320ceb..e453a8b89b9 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -8,7 +8,9 @@ from functools import partialmethod from typing import TYPE_CHECKING +import fastexcel import pytest +from packaging import version import polars @@ -44,7 +46,7 @@ def pytest_configure(config: pytest.Config) -> None: ) -EXPECTED_FAILURES: Mapping[str, str] = { +EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = { "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed", "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning", @@ -192,6 +194,10 @@ def pytest_configure(config: pytest.Config) -> None: # Maybe flaky, order-dependent? "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", + "tests/unit/io/test_spreadsheet.py::test_write_excel_bytes[calamine]": ( + "Fails when fastexcel version >= 0.12.1. tracking issue: https://github.com/pola-rs/polars/issues/20698", + version.parse(fastexcel.__version__) >= version.parse("0.12.1"), + ), } @@ -219,4 +225,12 @@ def pytest_collection_modifyitems( if item.nodeid in TESTS_TO_SKIP: item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid])) elif item.nodeid in EXPECTED_FAILURES: + if isinstance(EXPECTED_FAILURES[item.nodeid], tuple): + # the second entry in the tuple is the condition to xfail on + item.add_marker( + pytest.mark.xfail( + condition=EXPECTED_FAILURES[item.nodeid][1], + reason=EXPECTED_FAILURES[item.nodeid][0], + ), + ) item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) From 253fb2f10e921519502e562672d29029e844c2cf Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:41:47 -0800 Subject: [PATCH 2/3] Require to implement `AutoCloseable` for the classes derived from `HostUDFWrapper` (#17727) This adds the requirement to implement `AutoCloseable` to the classes derived from `HostUDFWrapper`, forcing them to delete the native UDF instance upon class destruction. Doing so will fix the memory leak issue when the native UDF instance never being destroyed. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/17727 --- java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java index 0b6ecf2e140..124f2c99188 100644 --- a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java +++ b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java @@ -24,8 +24,10 @@ *

* A new host UDF aggregation implementation must extend this class and override the * {@code hashCode} and {@code equals} methods for such purposes. + * In addition, since this class implements {@code AutoCloseable}, the {@code close} method must + * also be overridden to automatically delete the native UDF instance upon class destruction. */ -public abstract class HostUDFWrapper { +public abstract class HostUDFWrapper implements AutoCloseable { public final long udfNativeHandle; public HostUDFWrapper(long udfNativeHandle) { From 847029172ce47ef2109dc825f149ab58130a2fc6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Jan 2025 09:18:13 -0600 Subject: [PATCH 3/3] convert all nulls to nans in a specific scenario (#17677) Fixes: #17666 This PR ensures we convert all nulls to nan's in float columns only in pandas compatibility mode. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17677 --- python/cudf/cudf/core/column/column.py | 4 ++++ python/cudf/cudf/tests/test_series.py | 11 ++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 30da8727366..19f2802553d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2413,7 +2413,11 @@ def as_column( and pa.types.is_integer(arbitrary.type) and arbitrary.null_count > 0 ): + # TODO: Need to re-visit this cast and fill_null + # calls while addressing the following issue: + # https://github.com/rapidsai/cudf/issues/14149 arbitrary = arbitrary.cast(pa.float64()) + arbitrary = pc.fill_null(arbitrary, np.nan) if ( cudf.get_option("default_integer_bitwidth") and pa.types.is_integer(arbitrary.type) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index f8697c5c6b8..891c0ede9a4 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import datetime import decimal import hashlib @@ -3003,3 +3003,12 @@ def test_dtype_dtypes_equal(): ser = cudf.Series([0]) assert ser.dtype is ser.dtypes assert ser.dtypes is ser.to_pandas().dtypes + + +def test_null_like_to_nan_pandas_compat(): + with cudf.option_context("mode.pandas_compatible", True): + ser = cudf.Series([1, 2, np.nan, 10, None]) + pser = pd.Series([1, 2, np.nan, 10, None]) + + assert pser.dtype == ser.dtype + assert_eq(ser, pser)