From 62472e7730b6db67c5fb67b91b21b6b96d4b97dd Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 2 Nov 2022 13:21:38 -0700 Subject: [PATCH 01/17] add from_dict and to_dict --- docs/cudf/source/api_docs/dataframe.rst | 2 + python/cudf/cudf/core/dataframe.py | 249 +++++++++++++++++++++++ python/cudf/cudf/core/indexed_frame.py | 7 - python/cudf/cudf/tests/test_dataframe.py | 85 ++++++-- 4 files changed, 317 insertions(+), 26 deletions(-) diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index f5c9053ec92..fd717f5b18d 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -249,10 +249,12 @@ Serialization / IO / conversion :toctree: api/ DataFrame.from_arrow + DataFrame.from_dict DataFrame.from_pandas DataFrame.from_records DataFrame.hash_values DataFrame.to_arrow + DataFrame.to_dict DataFrame.to_dlpack DataFrame.to_parquet DataFrame.to_csv diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5c24b222a1b..957ddc1d96e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1992,6 +1992,247 @@ def _make_operands_and_index_for_binop( operands[k] = (left_default, v, reflect, None) return operands, index + @classmethod + def from_dict( + cls, + data: dict, + orient: str = "columns", + dtype: Dtype = None, + columns: list = None, + ) -> DataFrame: + """ + Construct DataFrame from dict of array-like or dicts. + Creates DataFrame object from dictionary by columns or by index + allowing dtype specification. + + Parameters + ---------- + data : dict + Of the form {field : array-like} or {field : dict}. + orient : {'columns', 'index', 'tight'}, default 'columns' + The "orientation" of the data. If the keys of the passed dict + should be the columns of the resulting DataFrame, pass 'columns' + (default). Otherwise if the keys should be rows, pass 'index'. 
+ If 'tight', assume a dict with keys ['index', 'columns', 'data', + 'index_names', 'column_names']. + dtype : dtype, default None + Data type to force, otherwise infer. + columns : list, default None + Column labels to use when ``orient='index'``. Raises a ``ValueError`` + if used with ``orient='columns'`` or ``orient='tight'``. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.from_records : DataFrame from structured ndarray, sequence + of tuples or dicts, or DataFrame. + DataFrame : DataFrame object creation using constructor. + DataFrame.to_dict : Convert the DataFrame to a dictionary. + + Examples + -------- + By default the keys of the dict become the DataFrame columns: + + >>> import cudf + >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} + >>> cudf.DataFrame.from_dict(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Specify ``orient='index'`` to create the DataFrame using dictionary + keys as rows: + + >>> data = {'row_1': [3, 2, 1, 0], 'row_2': [10, 11, 12, 13]} + >>> cudf.DataFrame.from_dict(data, orient='index') + 0 1 2 3 + row_1 3 2 1 0 + row_2 10 11 12 13 + + When using the 'index' orientation, the column names can be + specified manually: + + >>> cudf.DataFrame.from_dict(data, orient='index', + ... columns=['A', 'B', 'C', 'D']) + A B C D + row_1 3 2 1 0 + row_2 10 11 12 13 + + Specify ``orient='tight'`` to create the DataFrame using a 'tight' + format: + + >>> data = {'index': [('a', 'b'), ('a', 'c')], + ... 'columns': [('x', 1), ('y', 2)], + ... 'data': [[1, 3], [2, 4]], + ... 'index_names': ['n1', 'n2'], + ... 
'column_names': ['z1', 'z2']} + >>> cudf.DataFrame.from_dict(data, orient='tight') + z1 x y + z2 1 2 + n1 n2 + a b 1 3 + c 2 4 + """ # noqa: E501 + index = None + orient = orient.lower() + if orient == "index": + + if len(data) > 0: + data_list = list(data.values()) + if isinstance(data_list[0], (pd.Series, Series, dict)): + if isinstance(data_list[0], Series): + data = {key: val.to_pandas() for key, val in data} + data = _from_nested_dict(data) + else: + index = list(data.keys()) + data = data_list + df = pd.DataFrame(data=data, index=index, columns=columns) + df = cudf.from_pandas(df) + if dtype: + return df.astype(dtype) + return df + elif orient == "columns" or orient == "tight": + if columns is not None: + raise ValueError( + f"cannot use columns parameter with orient='{orient}'" + ) + else: + raise ValueError( + f"Expected 'index', 'columns' or 'tight' for orient " + f"parameter. Got '{orient}' instead" + ) + + if orient != "tight": + return cls(data, index=index, columns=columns, dtype=dtype) + else: + realdata = data["data"] + + def create_index(indexlist, namelist, library): + if len(namelist) > 1: + index = library.MultiIndex.from_tuples( + indexlist, names=namelist + ) + else: + index = library.Index(indexlist, name=namelist[0]) + return index + + index = create_index(data["index"], data["index_names"], cudf) + columns = create_index(data["columns"], data["column_names"], pd) + return cls(realdata, index=index, columns=columns, dtype=dtype) + + def to_dict( + self, + orient: str = "dict", + into: type[dict] = dict, + ) -> dict | list[dict]: + """ + Convert the DataFrame to a dictionary. + + The type of the key-value pairs can be customized with the parameters + (see below). + + Parameters + ---------- + orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'} + Determines the type of the values of the dictionary. 
+ + - 'dict' (default) : dict like {column -> {index -> value}} + - 'list' : dict like {column -> [values]} + - 'series' : dict like {column -> Series(values)} + - 'split' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} + - 'tight' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values], + 'index_names' -> [index.names], 'column_names' -> [column.names]} + - 'records' : list like + [{column -> value}, ... , {column -> value}] + - 'index' : dict like {index -> {column -> value}} + Abbreviations are allowed. `s` indicates `series` and `sp` + indicates `split`. + + into : class, default dict + The collections.abc.Mapping subclass used for all Mappings + in the return value. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + Returns + ------- + dict, list or collections.abc.Mapping + Return a collections.abc.Mapping object representing the DataFrame. + The resulting transformation depends on the `orient` parameter. + + See Also + -------- + DataFrame.from_dict: Create a DataFrame from a dictionary. + DataFrame.to_json: Convert a DataFrame to JSON format. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'col1': [1, 2], + ... 'col2': [0.5, 0.75]}, + ... index=['row1', 'row2']) + >>> df + col1 col2 + row1 1 0.50 + row2 2 0.75 + >>> df.to_dict() + {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} + + You can specify the return orientation. 
+ >>> df.to_dict('series') {'col1': row1 1 row2 2 Name: col1, dtype: int64, 'col2': row1 0.50 row2 0.75 Name: col2, dtype: float64} + >>> df.to_dict('split') {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]} + >>> df.to_dict('records') [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] + >>> df.to_dict('index') {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} + >>> df.to_dict('tight') {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} + You can also specify the mapping type. + >>> from collections import OrderedDict, defaultdict >>> df.to_dict(into=OrderedDict) OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) + If you want a `defaultdict`, you need to initialize it: >>> dd = defaultdict(list) >>> df.to_dict('records', into=dd) [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}), defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})] """ # noqa: E501 + orient = orient.lower() + + if orient == "series": + # Special case needed because we don't want + # convert the values as pd.Series, and want + # them as cudf.Series + into_c = pd.core.common.standardize_mapping(into) + return into_c((k, v) for k, v in self.items()) + + return self.to_pandas().to_dict(orient=orient, into=into) + @_cudf_nvtx_annotate def scatter_by_map( self, map_index, map_size=None, keep_index=True, **kwargs @@ -7444,3 +7685,11 @@ def _reassign_categories(categories, cols, col_idxs): offset=cols[name].offset, size=cols[name].size, ) + + +def _from_nested_dict(data) -> defaultdict: + new_data: defaultdict = defaultdict(dict) + for index, s in data.items(): + for col, v in s.items(): + new_data[col][index] = v + return new_data diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 57469c0ff72..ebcd42258f0 100644 
--- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -270,13 +270,6 @@ def __init__(self, data=None, index=None): # to ensure that this constructor is always invoked with an index. self._index = index - def to_dict(self, *args, **kwargs): # noqa: D102 - raise TypeError( - "cuDF does not support conversion to host memory " - "via `to_dict()` method. Consider using " - "`.to_pandas().to_dict()` to construct a Python dictionary." - ) - @property def _num_rows(self) -> int: # Important to use the index because the data may be empty. diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 1fcfbe5fc91..0b44c8d6a1a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9,6 +9,7 @@ import string import textwrap import warnings +from collections import OrderedDict, defaultdict from contextlib import contextmanager from copy import copy @@ -6762,27 +6763,73 @@ def test_cudf_isclose_different_index(): assert_eq(expected, cudf.isclose(s1, s2)) -def test_dataframe_to_dict_error(): - df = cudf.DataFrame({"a": [1, 2, 3], "b": [9, 5, 3]}) - with pytest.raises( - TypeError, - match=re.escape( - r"cuDF does not support conversion to host memory " - r"via `to_dict()` method. Consider using " - r"`.to_pandas().to_dict()` to construct a Python dictionary." - ), - ): - df.to_dict() +@pytest.mark.parametrize( + "orient", ["dict", "list", "split", "tight", "records", "index", "series"] +) +@pytest.mark.parametrize("into", [dict, OrderedDict, defaultdict(list)]) +def test_dataframe_to_dict(orient, into): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [9, 5, 3]}, index=[10, 11, 12]) + pdf = df.to_pandas() - with pytest.raises( - TypeError, - match=re.escape( - r"cuDF does not support conversion to host memory " - r"via `to_dict()` method. Consider using " - r"`.to_pandas().to_dict()` to construct a Python dictionary." 
+ actual = df.to_dict(orient=orient, into=into) + expected = pdf.to_dict(orient=orient, into=into) + if orient == "series": + assert actual.keys() == expected.keys() + for key in actual.keys(): + assert_eq(expected[key], actual[key]) + else: + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data, orient, dtype, columns", + [ + ( + {"col_1": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]}, + "columns", + None, + None, ), - ): - df["a"].to_dict() + ({"col_1": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]}, "index", None, None), + ( + {"col_1": [None, 2, 1, 0], "col_2": [3, None, 1, 0]}, + "index", + None, + ["A", "B", "C", "D"], + ), + ( + { + "col_1": ["ab", "cd", "ef", "gh"], + "col_2": ["zx", "one", "two", "three"], + }, + "index", + None, + ["A", "B", "C", "D"], + ), + ( + { + "index": [("a", "b"), ("a", "c")], + "columns": [("x", 1), ("y", 2)], + "data": [[1, 3], [2, 4]], + "index_names": ["n1", "n2"], + "column_names": ["z1", "z2"], + }, + "tight", + "float64", + None, + ), + ], +) +def test_dataframe_from_dict(data, orient, dtype, columns): + + expected = pd.DataFrame.from_dict( + data=data, orient=orient, dtype=dtype, columns=columns + ) + actual = pd.DataFrame.from_dict( + data=data, orient=orient, dtype=dtype, columns=columns + ) + + assert_eq(expected, actual) @pytest.mark.parametrize( From 83f29dd18d44a9bce3575c052fcf67dfc5a311f7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 2 Nov 2022 13:36:22 -0700 Subject: [PATCH 02/17] Add Series.to_dict --- docs/cudf/source/api_docs/series.rst | 1 + python/cudf/cudf/core/dataframe.py | 2 ++ python/cudf/cudf/core/series.py | 39 +++++++++++++++++++++++++++ python/cudf/cudf/tests/test_series.py | 12 +++++++++ 4 files changed, 54 insertions(+) diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 245793e5ea6..e0273d57155 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -367,6 +367,7 @@ Serialization / IO / conversion 
Series.to_arrow Series.to_cupy + Series.to_dict Series.to_dlpack Series.to_frame Series.to_hdf diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 957ddc1d96e..a27b3e20384 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1993,6 +1993,7 @@ def _make_operands_and_index_for_binop( return operands, index @classmethod + @_cudf_nvtx_annotate def from_dict( cls, data: dict, @@ -2125,6 +2126,7 @@ def create_index(indexlist, namelist, library): columns = create_index(data["columns"], data["column_names"], pd) return cls(realdata, index=index, columns=columns, dtype=dtype) + @_cudf_nvtx_annotate def to_dict( self, orient: str = "dict", diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 70e8c3d6860..8e3f34f2333 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -725,6 +725,45 @@ def drop( labels, axis, index, columns, level, inplace, errors ) + @_cudf_nvtx_annotate + def to_dict(self, into: type[dict] = dict) -> dict: + """ + Convert Series to {label -> value} dict or dict-like object. + + Parameters + ---------- + into : class, default dict + The collections.abc.Mapping subclass to use as the return + object. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + Returns + ------- + collections.abc.Mapping + Key-value representation of Series. 
+ + Examples + -------- + >>> import cudf + >>> s = cudf.Series([1, 2, 3, 4]) + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + >>> s.to_dict() + {0: 1, 1: 2, 2: 3, 3: 4} + >>> from collections import OrderedDict, defaultdict + >>> s.to_dict(OrderedDict) + OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)]) + >>> dd = defaultdict(list) + >>> s.to_dict(dd) + defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4}) + """ + return self.to_pandas().to_dict(into=into) + @_cudf_nvtx_annotate def append(self, to_append, ignore_index=False, verify_integrity=False): """Append values from another ``Series`` or array-like object. diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index c0b99f56238..a67816a02eb 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -3,6 +3,7 @@ import hashlib import operator import re +from collections import OrderedDict, defaultdict from string import ascii_letters, digits import cupy as cp @@ -1951,3 +1952,14 @@ def test_set_bool_error(dtype, bool_scalar): lfunc_args_and_kwargs=([bool_scalar],), rfunc_args_and_kwargs=([bool_scalar],), ) + + +@pytest.mark.parametrize("into", [dict, OrderedDict, defaultdict(list)]) +def test_series_to_dict(into): + gs = cudf.Series(["ab", "de", "zx"], index=[10, 20, 100]) + ps = gs.to_pandas() + + actual = gs.to_dict(into=into) + expected = ps.to_dict(into=into) + + assert_eq(expected, actual) From c2b0c2328ae9d118e9a4475b4b30c41efddb0d22 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 2 Nov 2022 13:52:50 -0700 Subject: [PATCH 03/17] fix docstring --- python/cudf/cudf/core/dataframe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a27b3e20384..aae1c9447ab 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2219,6 +2219,7 @@ def to_dict( ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) If you want a `defaultdict`, 
you need to initialize it: + >>> dd = defaultdict(list) >>> df.to_dict('records', into=dd) [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}), From cd5b23fc4cb0a34eb079bb9c40b7745f7daa1a28 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 4 Nov 2022 08:36:18 -0700 Subject: [PATCH 04/17] user from_dict --- python/dask_cudf/dask_cudf/backends.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index f02c75eb3e8..ba42de36092 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -474,10 +474,9 @@ def from_dict(data, npartitions, orient="columns", **kwargs): if orient != "columns": raise ValueError(f"orient={orient} is not supported") - # TODO: Use cudf.from_dict - # (See: https://github.com/rapidsai/cudf/issues/11934) + return from_cudf( - cudf.DataFrame(data), + cudf.DataFrame.from_dict(data, orient=orient, **kwargs), npartitions=npartitions, ) From 13fbe552770fdac356cb41873b782293c2f55d1e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 4 Nov 2022 14:15:16 -0700 Subject: [PATCH 05/17] add dask_cudf.from_dict --- python/dask_cudf/dask_cudf/__init__.py | 9 ++++++++- python/dask_cudf/dask_cudf/backends.py | 16 ++++++++-------- python/dask_cudf/dask_cudf/core.py | 12 ++++++++++++ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 5e3a9342c25..9e87f778229 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -6,7 +6,14 @@ from cudf._version import get_versions from . 
import backends -from .core import DataFrame, Series, concat, from_cudf, from_dask_dataframe +from .core import ( + DataFrame, + Series, + concat, + from_cudf, + from_dask_dataframe, + from_dict, +) from .groupby import groupby_agg from .io import read_csv, read_json, read_orc, read_text, to_orc diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index ba42de36092..f50d31cfcd9 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -470,14 +470,14 @@ class CudfBackendEntrypoint(DataFrameBackendEntrypoint): @staticmethod def from_dict(data, npartitions, orient="columns", **kwargs): - from dask_cudf import from_cudf - - if orient != "columns": - raise ValueError(f"orient={orient} is not supported") - - return from_cudf( - cudf.DataFrame.from_dict(data, orient=orient, **kwargs), - npartitions=npartitions, + from dask_cudf import from_dict + + return from_dict( + data, + npartitions, + orient=orient, + dtype=kwargs.get("dtype", None), + columns=kwargs.get("columns", None), ) @staticmethod diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 04b6ff401dc..51d0705b4e3 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -702,6 +702,18 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): ) +def from_dict(data, npartitions, orient="columns", dtype=None, columns=None): + + return dask.dataframe.io.from_dict( + data, + npartitions, + orient=orient, + dtype=dtype, + columns=columns, + constructor=cudf.DataFrame, + ) + + from_cudf.__doc__ = ( "Wraps main-line Dask from_pandas...\n" + dd.from_pandas.__doc__ ) From 02b88d1f2df6a41ded80143ab36092a64be1aa7e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 7 Nov 2022 05:45:42 -0800 Subject: [PATCH 06/17] import fix --- python/dask_cudf/dask_cudf/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 51d0705b4e3..565b8a9aea6 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -704,7 +704,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): def from_dict(data, npartitions, orient="columns", dtype=None, columns=None): - return dask.dataframe.io.from_dict( + return DataFrame.from_dict( data, npartitions, orient=orient, From f86a06bbd198ffb13b5c39ebcdae15dd1f2f3a40 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 7 Nov 2022 06:54:13 -0800 Subject: [PATCH 07/17] fix --- python/dask_cudf/dask_cudf/core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 565b8a9aea6..3aada8cd8ba 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -706,11 +706,10 @@ def from_dict(data, npartitions, orient="columns", dtype=None, columns=None): return DataFrame.from_dict( data, - npartitions, + npartitions=npartitions, orient=orient, dtype=dtype, columns=columns, - constructor=cudf.DataFrame, ) From 35c6f3fd857c2f9b8566dcee06032c596a788f19 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 7 Nov 2022 07:11:30 -0800 Subject: [PATCH 08/17] fix --- python/dask_cudf/dask_cudf/__init__.py | 9 +-------- python/dask_cudf/dask_cudf/backends.py | 15 +++++++++------ python/dask_cudf/dask_cudf/core.py | 11 ----------- 3 files changed, 10 insertions(+), 25 deletions(-) diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 9e87f778229..5e3a9342c25 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -6,14 +6,7 @@ from cudf._version import get_versions from . 
import backends -from .core import ( - DataFrame, - Series, - concat, - from_cudf, - from_dask_dataframe, - from_dict, -) +from .core import DataFrame, Series, concat, from_cudf, from_dask_dataframe from .groupby import groupby_agg from .io import read_csv, read_json, read_orc, read_text, to_orc diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index f50d31cfcd9..06e43538a32 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -469,15 +469,18 @@ class CudfBackendEntrypoint(DataFrameBackendEntrypoint): """ @staticmethod - def from_dict(data, npartitions, orient="columns", **kwargs): - from dask_cudf import from_dict + def from_dict( + data, npartitions, orient="columns", dtype=None, columns=None + ): - return from_dict( + return _default_backend( + dd.from_dict, data, - npartitions, + npartitions=npartitions, orient=orient, - dtype=kwargs.get("dtype", None), - columns=kwargs.get("columns", None), + dtype=dtype, + columns=columns, + constructor=cudf.DataFrame, ) @staticmethod diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 3aada8cd8ba..04b6ff401dc 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -702,17 +702,6 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): ) -def from_dict(data, npartitions, orient="columns", dtype=None, columns=None): - - return DataFrame.from_dict( - data, - npartitions=npartitions, - orient=orient, - dtype=dtype, - columns=columns, - ) - - from_cudf.__doc__ = ( "Wraps main-line Dask from_pandas...\n" + dd.from_pandas.__doc__ ) From 34cb032a528f4958b5d70186d4e373cccf75cc9c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 8 Nov 2022 10:50:59 -0600 Subject: [PATCH 09/17] Apply suggestions from code review Co-authored-by: Lawrence Mitchell --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index aae1c9447ab..7d362fbf608 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2087,7 +2087,7 @@ def from_dict( data_list = list(data.values()) if isinstance(data_list[0], (pd.Series, Series, dict)): if isinstance(data_list[0], Series): - data = {key: val.to_pandas() for key, val in data} + data = {key: val.to_pandas() for key, val in data.items()} data = _from_nested_dict(data) else: index = list(data.keys()) From 49a67d04d0cc09555c76a1fb770c296658f9c5fe Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 8 Nov 2022 08:57:14 -0800 Subject: [PATCH 10/17] inline implementation --- python/cudf/cudf/core/dataframe.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7d362fbf608..e6b8f9d2b71 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2087,8 +2087,14 @@ def from_dict( data_list = list(data.values()) if isinstance(data_list[0], (pd.Series, Series, dict)): if isinstance(data_list[0], Series): - data = {key: val.to_pandas() for key, val in data.items()} - data = _from_nested_dict(data) + data = { + key: val.to_pandas() for key, val in data.items() + } + new_data: defaultdict = defaultdict(dict) + for index, s in data.items(): + for col, v in s.items(): + new_data[col][index] = v + data = new_data else: index = list(data.keys()) data = data_list @@ -7688,11 +7694,3 @@ def _reassign_categories(categories, cols, col_idxs): offset=cols[name].offset, size=cols[name].size, ) - - -def _from_nested_dict(data) -> defaultdict: - new_data: defaultdict = defaultdict(dict) - for index, s in data.items(): - for col, v in s.items(): - new_data[col][index] = v - return new_data From 37f4f9f0ddbd6bc5f8d7fef1c98a5dfe63cc5d7e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 8 Nov 2022 10:29:21 
-0800 Subject: [PATCH 11/17] address reviews --- python/cudf/cudf/core/dataframe.py | 15 ++++++--------- python/cudf/cudf/tests/test_dataframe.py | 9 +++++++++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e6b8f9d2b71..e5583b3c651 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2085,21 +2085,18 @@ def from_dict( if len(data) > 0: data_list = list(data.values()) - if isinstance(data_list[0], (pd.Series, Series, dict)): - if isinstance(data_list[0], Series): - data = { - key: val.to_pandas() for key, val in data.items() - } + if isinstance(data_list[0], Series): + data = DataFrame.from_dict(data=data).T + elif isinstance(data_list[0], (pd.Series, dict)): new_data: defaultdict = defaultdict(dict) - for index, s in data.items(): + for i, s in data.items(): for col, v in s.items(): - new_data[col][index] = v + new_data[col][i] = v data = new_data else: index = list(data.keys()) data = data_list - df = pd.DataFrame(data=data, index=index, columns=columns) - df = cudf.from_pandas(df) + df = cudf.DataFrame(data=data, index=index, columns=columns) if dtype: return df.astype(dtype) return df diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 4e73aec8b71..6ea1c68ff61 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -6888,6 +6888,15 @@ def test_dataframe_from_dict(data, orient, dtype, columns): assert_eq(expected, actual) +def test_dataframe_from_dict_transposed(): + pd_data = {"a": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]} + gd_data = {key: cudf.Series(val) for key, val in pd_data.items()} + + expected = pd.DataFrame.from_dict(pd_data, orient="index") + actual = cudf.DataFrame.from_dict(gd_data, orient="index") + assert_eq(expected, actual) + + @pytest.mark.parametrize( "df", [ From a68f36c683711c48297ed8528b780dde02995f3f Mon Sep 17 
00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 8 Nov 2022 14:15:35 -0600 Subject: [PATCH 12/17] Update python/cudf/cudf/core/dataframe.py --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e5583b3c651..356b721f494 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2112,7 +2112,7 @@ def from_dict( ) if orient != "tight": - return cls(data, index=index, columns=columns, dtype=dtype) + return cls(data, index=index, dtype=dtype) else: realdata = data["data"] From 68927bd39bd05fce78e719623e806f481643f602 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 9 Nov 2022 16:08:43 -0600 Subject: [PATCH 13/17] Apply suggestions from code review --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 356b721f494..3364b397501 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2088,7 +2088,7 @@ def from_dict( if isinstance(data_list[0], Series): data = DataFrame.from_dict(data=data).T elif isinstance(data_list[0], (pd.Series, dict)): - new_data: defaultdict = defaultdict(dict) + new_data = defaultdict(dict) for i, s in data.items(): for col, v in s.items(): new_data[col][i] = v From dab48da216126b9ab7ffb5b65c11d48cbe4efa57 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 9 Nov 2022 16:12:29 -0600 Subject: [PATCH 14/17] Update python/cudf/cudf/core/dataframe.py --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3364b397501..356b721f494 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2088,7 +2088,7 @@ def from_dict( if isinstance(data_list[0], Series): data 
= DataFrame.from_dict(data=data).T elif isinstance(data_list[0], (pd.Series, dict)): - new_data = defaultdict(dict) + new_data: defaultdict = defaultdict(dict) for i, s in data.items(): for col, v in s.items(): new_data[col][i] = v From 83003763f25e62efd7c9ba6b144ada44af9e7fcb Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 9 Nov 2022 16:15:47 -0600 Subject: [PATCH 15/17] Update python/cudf/cudf/core/dataframe.py Co-authored-by: Vyas Ramasubramani --- python/cudf/cudf/core/dataframe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 356b721f494..b05af5e0419 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2231,9 +2231,8 @@ def to_dict( orient = orient.lower() if orient == "series": - # Special case needed because we don't want - # convert the values as pd.Series, and want - # them as cudf.Series + # Special case needed to avoid converting + # cudf.Series objects into pd.Series into_c = pd.core.common.standardize_mapping(into) return into_c((k, v) for k, v in self.items()) From 281b935e773de1b4c882a7d8adf11e402b2a4133 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 11 Nov 2022 07:41:31 -0800 Subject: [PATCH 16/17] address reviews --- python/cudf/cudf/core/dataframe.py | 83 ++++++++++----------- python/cudf/cudf/core/frame.py | 12 ++-- python/cudf/cudf/tests/test_dataframe.py | 91 +++++++++++++++++++++++- 3 files changed, 141 insertions(+), 45 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 9a88d138153..b5da2206a6b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2079,56 +2079,51 @@ def from_dict( a b 1 3 c 2 4 """ # noqa: E501 - index = None + orient = orient.lower() if orient == "index": - - if len(data) > 0: - data_list = list(data.values()) - if isinstance(data_list[0], Series): - data = 
DataFrame.from_dict(data=data).T - elif isinstance(data_list[0], (pd.Series, dict)): - new_data: defaultdict = defaultdict(dict) - for i, s in data.items(): - for col, v in s.items(): - new_data[col][i] = v - data = new_data - else: - index = list(data.keys()) - data = data_list - df = cudf.DataFrame(data=data, index=index, columns=columns) - if dtype: - return df.astype(dtype) - return df - elif orient == "columns" or orient == "tight": + if len(data) > 0 and isinstance( + next(iter(data.values())), (cudf.Series, cupy.ndarray) + ): + result = cls(data).T + result.columns = columns + if dtype is not None: + result = result.astype(dtype) + return result + else: + return cls.from_pandas( + pd.DataFrame.from_dict( + data=data, + orient=orient, + dtype=dtype, + columns=columns, + ) + ) + elif orient == "columns": + if columns is not None: + raise ValueError( + "Cannot use columns parameter with orient='columns'" + ) + return cls(data, columns=None, dtype=dtype) + elif orient == "tight": if columns is not None: raise ValueError( - f"cannot use columns parameter with orient='{orient}'" + "Cannot use columns parameter with orient='right'" ) + + index = _from_dict_create_index( + data["index"], data["index_names"], cudf + ) + columns = _from_dict_create_index( + data["columns"], data["column_names"], pd + ) + return cls(data["data"], index=index, columns=columns, dtype=dtype) else: raise ValueError( - f"Expected 'index', 'columns' or 'tight' for orient " + "Expected 'index', 'columns' or 'tight' for orient " f"parameter. 
Got '{orient}' instead" ) - if orient != "tight": - return cls(data, index=index, columns=columns, dtype=dtype) - else: - realdata = data["data"] - - def create_index(indexlist, namelist, library): - if len(namelist) > 1: - index = library.MultiIndex.from_tuples( - indexlist, names=namelist - ) - else: - index = library.Index(indexlist, name=namelist[0]) - return index - - index = create_index(data["index"], data["index_names"], cudf) - columns = create_index(data["columns"], data["column_names"], pd) - return cls(realdata, index=index, columns=columns, dtype=dtype) - @_cudf_nvtx_annotate def to_dict( self, @@ -7691,3 +7686,11 @@ def _reassign_categories(categories, cols, col_idxs): offset=cols[name].offset, size=cols[name].size, ) + + +def _from_dict_create_index(indexlist, namelist, library): + if len(namelist) > 1: + index = library.MultiIndex.from_tuples(indexlist, names=namelist) + else: + index = library.Index(indexlist, name=namelist[0]) + return index diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 29d5c9ae26d..687338f882d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -321,15 +321,19 @@ def __len__(self): @_cudf_nvtx_annotate def astype(self, dtype, copy=False, **kwargs): - result = {} + result_data = {} for col_name, col in self._data.items(): dt = dtype.get(col_name, col.dtype) if not is_dtype_equal(dt, col.dtype): - result[col_name] = col.astype(dt, copy=copy, **kwargs) + result_data[col_name] = col.astype(dt, copy=copy, **kwargs) else: - result[col_name] = col.copy() if copy else col + result_data[col_name] = col.copy() if copy else col - return result + return ColumnAccessor._create_unsafe( + data=result_data, + multiindex=self._data.multiindex, + level_names=self._data.level_names, + ) @_cudf_nvtx_annotate def equals(self, other): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6ea1c68ff61..6a7dfcbf7ff 100644 --- 
a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -6881,7 +6881,8 @@ def test_dataframe_from_dict(data, orient, dtype, columns): expected = pd.DataFrame.from_dict( data=data, orient=orient, dtype=dtype, columns=columns ) - actual = pd.DataFrame.from_dict( + + actual = cudf.DataFrame.from_dict( data=data, orient=orient, dtype=dtype, columns=columns ) @@ -6894,9 +6895,97 @@ def test_dataframe_from_dict_transposed(): expected = pd.DataFrame.from_dict(pd_data, orient="index") actual = cudf.DataFrame.from_dict(gd_data, orient="index") + + gd_data = {key: cupy.asarray(val) for key, val in pd_data.items()} + actual = cudf.DataFrame.from_dict(gd_data, orient="index") assert_eq(expected, actual) +@pytest.mark.parametrize( + "pd_data, gd_data, orient, dtype, columns", + [ + ( + {"col_1": np.array([3, 2, 1, 0]), "col_2": np.array([3, 2, 1, 0])}, + { + "col_1": cupy.array([3, 2, 1, 0]), + "col_2": cupy.array([3, 2, 1, 0]), + }, + "columns", + None, + None, + ), + ( + {"col_1": np.array([3, 2, 1, 0]), "col_2": np.array([3, 2, 1, 0])}, + { + "col_1": cupy.array([3, 2, 1, 0]), + "col_2": cupy.array([3, 2, 1, 0]), + }, + "index", + None, + None, + ), + ( + { + "col_1": np.array([None, 2, 1, 0]), + "col_2": np.array([3, None, 1, 0]), + }, + { + "col_1": cupy.array([np.nan, 2, 1, 0]), + "col_2": cupy.array([3, np.nan, 1, 0]), + }, + "index", + None, + ["A", "B", "C", "D"], + ), + ( + { + "col_1": np.array(["ab", "cd", "ef", "gh"]), + "col_2": np.array(["zx", "one", "two", "three"]), + }, + { + "col_1": np.array(["ab", "cd", "ef", "gh"]), + "col_2": np.array(["zx", "one", "two", "three"]), + }, + "index", + None, + ["A", "B", "C", "D"], + ), + ( + { + "index": [("a", "b"), ("a", "c")], + "columns": [("x", 1), ("y", 2)], + "data": [np.array([1, 3]), np.array([2, 4])], + "index_names": ["n1", "n2"], + "column_names": ["z1", "z2"], + }, + { + "index": [("a", "b"), ("a", "c")], + "columns": [("x", 1), ("y", 2)], + "data": [cupy.array([1, 3]), 
cupy.array([2, 4])], + "index_names": ["n1", "n2"], + "column_names": ["z1", "z2"], + }, + "tight", + "float64", + None, + ), + ], +) +def test_dataframe_from_dict_cp_np_arrays( + pd_data, gd_data, orient, dtype, columns +): + + expected = pd.DataFrame.from_dict( + data=pd_data, orient=orient, dtype=dtype, columns=columns + ) + + actual = cudf.DataFrame.from_dict( + data=gd_data, orient=orient, dtype=dtype, columns=columns + ) + + assert_eq(expected, actual, check_dtype=dtype is not None) + + @pytest.mark.parametrize( "df", [ From dc7eaa5b0d68dd02f7dcb8ed040237a1f6b3d501 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 14 Nov 2022 10:29:59 -0800 Subject: [PATCH 17/17] add coverage --- python/cudf/cudf/tests/test_dataframe.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6a7dfcbf7ff..4ec770e0d6b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -6889,15 +6889,16 @@ def test_dataframe_from_dict(data, orient, dtype, columns): assert_eq(expected, actual) -def test_dataframe_from_dict_transposed(): +@pytest.mark.parametrize("dtype", ["int64", "str", None]) +def test_dataframe_from_dict_transposed(dtype): pd_data = {"a": [3, 2, 1, 0], "col_2": [3, 2, 1, 0]} gd_data = {key: cudf.Series(val) for key, val in pd_data.items()} - expected = pd.DataFrame.from_dict(pd_data, orient="index") - actual = cudf.DataFrame.from_dict(gd_data, orient="index") + expected = pd.DataFrame.from_dict(pd_data, orient="index", dtype=dtype) + actual = cudf.DataFrame.from_dict(gd_data, orient="index", dtype=dtype) gd_data = {key: cupy.asarray(val) for key, val in pd_data.items()} - actual = cudf.DataFrame.from_dict(gd_data, orient="index") + actual = cudf.DataFrame.from_dict(gd_data, orient="index", dtype=dtype) assert_eq(expected, actual)