
Merge pull request #17771 from Matt711/fea/bump-polars-version
Bump polars version to 1.20
sean-frye authored Jan 28, 2025
2 parents e0fe51d + 3aa8f0f commit 328605f
Showing 13 changed files with 103 additions and 106 deletions.
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -67,7 +67,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.18
+- polars>=1.20,<1.22
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<20.0.0a0
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -65,7 +65,7 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.18
+- polars>=1.20,<1.22
 - pre-commit
 - pyarrow>=14.0.0,<20.0.0a0
 - pydata-sphinx-theme>=0.15.4
2 changes: 1 addition & 1 deletion conda/recipes/cudf-polars/meta.yaml
@@ -43,7 +43,7 @@ requirements:
   run:
     - python
     - pylibcudf ={{ version }}
-    - polars >=1.11,<1.18
+    - polars >=1.20,<1.22
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}

 test:
2 changes: 1 addition & 1 deletion dependencies.yaml
@@ -777,7 +777,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.11,<1.18
+          - polars>=1.20,<1.22
   run_cudf_polars_experimental:
     common:
       - output_types: [conda, requirements, pyproject]
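Note: the conda environment files and the recipe pin above are generated from this single dependencies.yaml entry (see the output_types line), so the bound only has to change once. A quick sketch, not project code, for checking an installed polars against the new range:

    # Sketch: verify the installed polars falls inside the new supported range.
    from packaging.version import Version

    import polars

    assert Version("1.20") <= Version(polars.__version__) < Version("1.22")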
3 changes: 2 additions & 1 deletion python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -58,6 +58,7 @@ class Name(IntEnum):
         OrdinalDay = auto()
         Quarter = auto()
         ReplaceTimeZone = auto()
+        Replace = auto()
         Round = auto()
         Second = auto()
         Time = auto()
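The new Replace member tracks the dt.replace expression on the polars side; a minimal sketch of the API it appears to correspond to (assuming polars >= 1.20, where Expr.dt.replace became available):

    # Sketch (assumes polars >= 1.20): replace datetime components in place.
    import datetime

    import polars as pl

    df = pl.DataFrame({"ts": [datetime.datetime(2024, 1, 15, 12, 30)]})
    # Swap out the year and hour, leaving the other components untouched.
    print(df.select(pl.col("ts").dt.replace(year=2025, hour=0)))
    # expected: 2025-01-15 00:30:00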
5 changes: 3 additions & 2 deletions python/cudf_polars/cudf_polars/dsl/expressions/string.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -41,7 +41,7 @@ class Name(IntEnum):
         ConcatHorizontal = auto()
         ConcatVertical = auto()
         Contains = auto()
-        ContainsMany = auto()
+        ContainsAny = auto()
         CountMatches = auto()
         EndsWith = auto()
         EscapeRegex = auto()
@@ -57,6 +57,7 @@ class Name(IntEnum):
         LenBytes = auto()
         LenChars = auto()
         Lowercase = auto()
+        Normalize = auto()
         PadEnd = auto()
         PadStart = auto()
         Replace = auto()
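The renamed ContainsAny member matches the name polars uses for the str.contains_any expression, and Normalize tracks str.normalize. A small sketch of the former:

    # Sketch: the expression that lowers to StringFunction.Name.ContainsAny.
    import polars as pl

    df = pl.DataFrame({"s": ["hamburger", "nuts", "lollipop"]})
    print(df.select(pl.col("s").str.contains_any(["burger", "pop"])))
    # expected: [true, false, true]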
64 changes: 28 additions & 36 deletions python/cudf_polars/cudf_polars/dsl/ir.py
@@ -31,7 +31,6 @@
 from cudf_polars.dsl.nodebase import Node
 from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter
 from cudf_polars.utils import dtypes
-from cudf_polars.utils.versions import POLARS_VERSION_GT_112

 if TYPE_CHECKING:
     from collections.abc import Callable, Hashable, Iterable, MutableMapping, Sequence
@@ -628,12 +627,7 @@ def slice_skip(tbl: plc.Table):
             )  # pragma: no cover; post init trips first
         if row_index is not None:
             name, offset = row_index
-            if POLARS_VERSION_GT_112:
-                # If we sliced away some data from the start, that
-                # shifts the row index.
-                # But prior to 1.13, polars had this wrong, so we match behaviour
-                # https://github.com/pola-rs/polars/issues/19607
-                offset += skip_rows
+            offset += skip_rows
             dtype = schema[name]
             step = plc.interop.from_arrow(
                 pa.scalar(1, type=plc.interop.to_arrow(dtype))
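The deleted branch guarded against a pre-1.13 polars bug (pola-rs/polars#19607); with the minimum supported version now at 1.20, the offset shift applies unconditionally. A sketch of the behaviour being preserved:

    # Sketch: slicing rows off the front of a scan shifts the row index
    # (polars >= 1.13 semantics, which this code now assumes).
    import pathlib
    import tempfile

    import polars as pl

    with tempfile.TemporaryDirectory() as tmp:
        path = pathlib.Path(tmp) / "t.parquet"
        pl.DataFrame({"a": [10, 20, 30, 40]}).write_parquet(path)
        out = (
            pl.scan_parquet(path, row_index_name="idx")
            .slice(2, 2)  # pushed into the scan as skip_rows
            .collect()
        )
        print(out)  # idx == [2, 3]: the skipped rows shift the index offset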
@@ -763,11 +757,7 @@ def do_evaluate(
             c.obj.type() == dtype
             for c, dtype in zip(df.columns, schema.values(), strict=True)
         )
-        if predicate is not None:
-            (mask,) = broadcast(predicate.evaluate(df), target_length=df.num_rows)
-            return df.filter(mask)
-        else:
-            return df
+        return df


 class Select(IR):
@@ -1107,7 +1097,7 @@ class Join(IR):
     right_on: tuple[expr.NamedExpr, ...]
     """List of expressions used as keys in the right frame."""
     options: tuple[
-        Literal["inner", "left", "right", "full", "semi", "anti", "cross"],
+        Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"],
         bool,
         tuple[int, int] | None,
         str,
@@ -1142,50 +1132,45 @@ def __init__(
         # TODO: Implement maintain_order
         if options[5] != "none":
             raise NotImplementedError("maintain_order not implemented yet")
-        if any(
-            isinstance(e.value, expr.Literal)
-            for e in itertools.chain(self.left_on, self.right_on)
-        ):
-            raise NotImplementedError("Join with literal as join key.")

     @staticmethod
     @cache
     def _joiners(
-        how: Literal["inner", "left", "right", "full", "semi", "anti"],
+        how: Literal["Inner", "Left", "Right", "Full", "Semi", "Anti"],
     ) -> tuple[
         Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None
     ]:
-        if how == "inner":
+        if how == "Inner":
             return (
                 plc.join.inner_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
             )
-        elif how == "left" or how == "right":
+        elif how == "Left" or how == "Right":
             return (
                 plc.join.left_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
                 plc.copying.OutOfBoundsPolicy.NULLIFY,
             )
-        elif how == "full":
+        elif how == "Full":
             return (
                 plc.join.full_join,
                 plc.copying.OutOfBoundsPolicy.NULLIFY,
                 plc.copying.OutOfBoundsPolicy.NULLIFY,
             )
-        elif how == "semi":
+        elif how == "Semi":
             return (
                 plc.join.left_semi_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
                 None,
             )
-        elif how == "anti":
+        elif how == "Anti":
             return (
                 plc.join.left_anti_join,
                 plc.copying.OutOfBoundsPolicy.DONT_CHECK,
                 None,
             )
-        assert_never(how)
+        assert_never(how)  # pragma: no cover

     @staticmethod
     def _reorder_maps(
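The capitalized strings are how polars now spells join types in its IR; the public API is unchanged. A sketch showing that user code still passes lowercase how= while the IR node carries the capitalized name:

    # Sketch: public polars code is unaffected by the IR rename. With
    # engine="gpu", this query is the kind that reaches the Join node above.
    import polars as pl

    left = pl.LazyFrame({"a": [1, 2, 3], "x": ["p", "q", "r"]})
    right = pl.LazyFrame({"a": [2, 3, 4], "y": [0.1, 0.2, 0.3]})
    # how="inner" in the API arrives in the polars IR as "Inner".
    print(left.join(right, on="a", how="inner").collect())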
@@ -1246,7 +1231,7 @@ def do_evaluate(
         left_on_exprs: Sequence[expr.NamedExpr],
         right_on_exprs: Sequence[expr.NamedExpr],
         options: tuple[
-            Literal["inner", "left", "right", "full", "semi", "anti", "cross"],
+            Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"],
             bool,
             tuple[int, int] | None,
             str,
@@ -1258,7 +1243,7 @@ def do_evaluate(
     ) -> DataFrame:
         """Evaluate and return a dataframe."""
         how, join_nulls, zlice, suffix, coalesce, _ = options
-        if how == "cross":
+        if how == "Cross":
             # Separate implementation, since cross_join returns the
             # result, not the gather maps
             columns = plc.join.cross_join(left.table, right.table).columns()
@@ -1295,25 +1280,32 @@
             table = plc.copying.gather(left.table, lg, left_policy)
             result = DataFrame.from_table(table, left.column_names)
         else:
-            if how == "right":
+            if how == "Right":
                 # Right join is a left join with the tables swapped
                 left, right = right, left
                 left_on, right_on = right_on, left_on
             lg, rg = join_fn(left_on.table, right_on.table, null_equality)
-            if how == "left" or how == "right":
+            if how == "Left" or how == "Right":
                 # Order of left table is preserved
                 lg, rg = cls._reorder_maps(
                     left.num_rows, lg, left_policy, right.num_rows, rg, right_policy
                 )
-            if coalesce and how == "inner":
-                right = right.discard_columns(right_on.column_names_set)
+            if coalesce:
+                if how == "Full":
+                    # In this case, keys must be column references,
+                    # possibly with dtype casting. We should use them in
+                    # preference to the columns from the original tables.
+                    left = left.with_columns(left_on.columns, replace_only=True)
+                    right = right.with_columns(right_on.columns, replace_only=True)
+                else:
+                    right = right.discard_columns(right_on.column_names_set)
             left = DataFrame.from_table(
                 plc.copying.gather(left.table, lg, left_policy), left.column_names
             )
             right = DataFrame.from_table(
                 plc.copying.gather(right.table, rg, right_policy), right.column_names
             )
-            if coalesce and how != "inner":
+            if coalesce and how == "Full":
                 left = left.with_columns(
                     (
                         Column(
@@ -1329,7 +1321,7 @@
                     replace_only=True,
                 )
                 right = right.discard_columns(right_on.column_names_set)
-            if how == "right":
+            if how == "Right":
                 # Undo the swap for right join before gluing together.
                 left, right = right, left
             right = right.rename_columns(
@@ -1374,7 +1366,9 @@
"""Evaluate and return a dataframe."""
columns = [c.evaluate(df) for c in exprs]
if should_broadcast:
columns = broadcast(*columns, target_length=df.num_rows)
columns = broadcast(
*columns, target_length=df.num_rows if df.num_columns != 0 else None
)
else:
# Polars ensures this is true, but let's make sure nothing
# went wrong. In this case, the parent node is a
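The extra num_columns check covers selecting from a frame with no columns, where there is no input length to broadcast to. A sketch of the kind of query this appears to target:

    # Sketch: with no input columns there is no target length, so the literal
    # keeps its natural length of 1 instead of broadcasting to zero rows.
    import polars as pl

    print(pl.LazyFrame().select(pl.lit(1)).collect())  # shape: (1, 1)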
@@ -1769,8 +1763,6 @@ def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR)
         self._non_child_args = (zlice,)
         self.children = children
         schema = self.children[0].schema
-        if not all(s.schema == schema for s in self.children[1:]):
-            raise NotImplementedError("Schema mismatch")

     @classmethod
     def do_evaluate(cls, zlice: tuple[int, int] | None, *dfs: DataFrame) -> DataFrame:
51 changes: 39 additions & 12 deletions python/cudf_polars/cudf_polars/dsl/translate.py
@@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR:
         # IR is versioned with major.minor, minor is bumped for backwards
         # compatible changes (e.g. adding new nodes), major is bumped for
         # incompatible changes (e.g. renaming nodes).
-        if (version := self.visitor.version()) >= (4, 3):
+        if (version := self.visitor.version()) >= (5, 1):
             e = NotImplementedError(
                 f"No support for polars IR {version=}"
             )  # pragma: no cover; no such version for now.
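When the visitor reports an IR version newer than the translator supports, translation raises NotImplementedError; by default the GPU engine then falls back to the CPU engine with a warning rather than failing the query. A sketch of the user-visible behaviour (assuming cudf_polars is installed):

    # Sketch: unsupported queries warn and fall back to the CPU engine.
    import polars as pl

    q = pl.LazyFrame({"a": [1, 2, 3]}).select(pl.col("a").sum())
    q.collect(engine="gpu")  # warns and falls back if anything is unsupported
    q.collect(engine=pl.GPUEngine(raise_on_fail=True))  # raise instead of falling back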
@@ -302,25 +302,52 @@ def _(
     # Join key dtypes are dependent on the schema of the left and
     # right inputs, so these must be translated with the relevant
     # input active.
+    def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal:
+        if literal.dtype.id() == plc.types.TypeId.INT32:
+            plc_int64 = plc.types.DataType(plc.types.TypeId.INT64)
+            return expr.Literal(
+                plc_int64,
+                pa.scalar(literal.value.as_py(), type=plc.interop.to_arrow(plc_int64)),
+            )
+        return literal
+
+    def maybe_adjust_binop(e) -> expr.Expr:
+        if isinstance(e.value, expr.BinOp):
+            left, right = e.value.children
+            if isinstance(left, expr.Col) and isinstance(right, expr.Literal):
+                e.value.children = (left, adjust_literal_dtype(right))
+            elif isinstance(left, expr.Literal) and isinstance(right, expr.Col):
+                e.value.children = (adjust_literal_dtype(left), right)
+        return e
+
+    def translate_expr_and_maybe_fix_binop_args(translator, exprs):
+        return [
+            maybe_adjust_binop(translate_named_expr(translator, n=e)) for e in exprs
+        ]
+
     with set_node(translator.visitor, node.input_left):
         inp_left = translator.translate_ir(n=None)
-        left_on = [translate_named_expr(translator, n=e) for e in node.left_on]
+        # TODO: There's a bug in the polars type coercion phase. Use
+        # translate_named_expr directly once it is resolved.
+        # Tracking issue: https://github.com/pola-rs/polars/issues/20935
+        left_on = translate_expr_and_maybe_fix_binop_args(translator, node.left_on)
     with set_node(translator.visitor, node.input_right):
         inp_right = translator.translate_ir(n=None)
-        right_on = [translate_named_expr(translator, n=e) for e in node.right_on]
+        right_on = translate_expr_and_maybe_fix_binop_args(translator, node.right_on)

     if (how := node.options[0]) in {
-        "inner",
-        "left",
-        "right",
-        "full",
-        "cross",
-        "semi",
-        "anti",
+        "Inner",
+        "Left",
+        "Right",
+        "Full",
+        "Cross",
+        "Semi",
+        "Anti",
     }:
         return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right)
     else:
-        how, op1, op2 = how
-        if how != "ie_join":
+        how, op1, op2 = node.options[0]
+        if how != "IEJoin":
             raise NotImplementedError(
                 f"Unsupported join type {how}"
             )  # pragma: no cover; asof joins not yet exposed
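A sketch of the coercion gap these helpers paper over (pola-rs/polars#20935): a small integer literal inside a join-key expression is typed Int32 in the polars IR while the column side is Int64, so the literal is upcast before the Join node is built:

    # Sketch: a join key of the form col + literal triggers the workaround.
    import polars as pl

    left = pl.LazyFrame({"a": [1, 2, 3]})
    right = pl.LazyFrame({"b": [2, 3, 4]})
    q = left.join(right, left_on=pl.col("a") + 1, right_on="b")
    # Under engine="gpu", the Int32 literal 1 is upcast to Int64 to match col("a").
    print(q.collect())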
20 changes: 13 additions & 7 deletions python/cudf_polars/cudf_polars/testing/plugin.py
@@ -8,9 +8,7 @@
 from functools import partialmethod
 from typing import TYPE_CHECKING

-import fastexcel
 import pytest
-from packaging import version

 import polars

@@ -124,6 +122,9 @@ def pytest_configure(config: pytest.Config) -> None:
"tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match",
"tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR",
"tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR",
"tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_parquet-write_parquet]": "Debug output on stderr doesn't match",
"tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_csv-write_csv]": "Debug output on stderr doesn't match",
"tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_ndjson-write_ndjson]": "Debug output on stderr doesn't match",
"tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR",
"tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR",
"tests/unit/io/test_write.py::test_write_async[<lambda>-write_csv]": "Need to add include_file_path to IR",
Expand Down Expand Up @@ -178,6 +179,7 @@ def pytest_configure(config: pytest.Config) -> None:
"tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype",
"tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype",
"tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg",
"tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg",
"tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg",
"tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852",
"tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins",
Expand All @@ -194,10 +196,6 @@ def pytest_configure(config: pytest.Config) -> None:
     # Maybe flaky, order-dependent?
     "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order",
     "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero",
-    "tests/unit/io/test_spreadsheet.py::test_write_excel_bytes[calamine]": (
-        "Fails when fastexcel version >= 0.12.1. tracking issue: https://github.com/pola-rs/polars/issues/20698",
-        version.parse(fastexcel.__version__) >= version.parse("0.12.1"),
-    ),
 }


@@ -211,6 +209,11 @@ def pytest_configure(config: pytest.Config) -> None:
     # polars that the requested timezone is unknown.
     # Since this is random, just skip it, rather than xfailing.
     "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names",
+    # The test may segfault with the legacy streaming engine. We should
+    # remove this skip when all polars tests use the new streaming engine.
+    "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine",
+    # Fails in CI, but passes locally
+    "tests/unit/streaming/test_streaming.py::test_streaming_streamable_functions": "RuntimeError: polars_python::sql::PySQLContext is unsendable, but is being dropped on another thread",
 }


@@ -233,4 +236,7 @@ def pytest_collection_modifyitems(
                         reason=EXPECTED_FAILURES[item.nodeid][0],
                     ),
                 )
-            item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid]))
+            else:
+                item.add_marker(
+                    pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])
+                )
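For reference, a sketch of the two entry shapes the dictionary supports after this change, using hypothetical test ids:

    # Sketch: bare string entries always xfail; (reason, condition) tuples
    # xfail only when the condition holds. Test ids here are hypothetical.
    import sys

    EXPECTED_FAILURES: dict[str, str | tuple[str, bool]] = {
        "tests/unit/foo.py::test_always": "known mismatch",
        "tests/unit/foo.py::test_conditional": (
            "fails only under this condition",
            sys.version_info >= (3, 12),  # second element gates the xfail
        ),
    }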