Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Add support for DataFrame.from_dict/to_dict and Series.to_dict #12048

Merged
merged 25 commits into from
Nov 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
62472e7
add from_dict and to_dict
galipremsagar Nov 2, 2022
83f29dd
Add Series.to_dict
galipremsagar Nov 2, 2022
c2b0c23
fix docstring
galipremsagar Nov 2, 2022
96fa517
Merge remote-tracking branch 'upstream/branch-22.12' into 11934
galipremsagar Nov 4, 2022
cd5b23f
user from_dict
galipremsagar Nov 4, 2022
0834a89
Merge remote-tracking branch 'upstream/branch-22.12' into 11934
galipremsagar Nov 4, 2022
13fbe55
add dask_cudf.from_dict
galipremsagar Nov 4, 2022
02b88d1
import fix
galipremsagar Nov 7, 2022
fb4d7f6
Merge remote-tracking branch 'upstream/branch-22.12' into 11934
galipremsagar Nov 7, 2022
f86a06b
fix
galipremsagar Nov 7, 2022
35c6f3f
fix
galipremsagar Nov 7, 2022
6ad3afa
Merge remote-tracking branch 'upstream/branch-22.12' into 11934
galipremsagar Nov 8, 2022
34cb032
Apply suggestions from code review
galipremsagar Nov 8, 2022
fae49cc
Merge branch '11934' of https://github.com/galipremsagar/cudf into 11934
galipremsagar Nov 8, 2022
49a67d0
inline implementation
galipremsagar Nov 8, 2022
37f4f9f
address reviews
galipremsagar Nov 8, 2022
a68f36c
Update python/cudf/cudf/core/dataframe.py
galipremsagar Nov 8, 2022
68927bd
Apply suggestions from code review
galipremsagar Nov 9, 2022
dab48da
Update python/cudf/cudf/core/dataframe.py
galipremsagar Nov 9, 2022
8300376
Update python/cudf/cudf/core/dataframe.py
galipremsagar Nov 9, 2022
deccd56
Merge remote-tracking branch 'upstream/branch-22.12' into 11934
galipremsagar Nov 11, 2022
281b935
address reviews
galipremsagar Nov 11, 2022
44942e7
Merge
galipremsagar Nov 11, 2022
cbf4551
Merge remote-tracking branch 'upstream/branch-22.12' into 11934
galipremsagar Nov 14, 2022
dc7eaa5
add coverage
galipremsagar Nov 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/cudf/source/api_docs/dataframe.rst
Original file line number Diff line number Diff line change
Expand Up @@ -250,10 +250,12 @@ Serialization / IO / conversion
:toctree: api/

DataFrame.from_arrow
DataFrame.from_dict
DataFrame.from_pandas
DataFrame.from_records
DataFrame.hash_values
DataFrame.to_arrow
DataFrame.to_dict
DataFrame.to_dlpack
DataFrame.to_parquet
DataFrame.to_csv
Expand Down
1 change: 1 addition & 0 deletions docs/cudf/source/api_docs/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,7 @@ Serialization / IO / conversion

Series.to_arrow
Series.to_cupy
Series.to_dict
Series.to_dlpack
Series.to_frame
Series.to_hdf
Expand Down
249 changes: 249 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1992,6 +1992,247 @@ def _make_operands_and_index_for_binop(
operands[k] = (left_default, v, reflect, None)
return operands, index

@classmethod
@_cudf_nvtx_annotate
def from_dict(
    cls,
    data: dict,
    orient: str = "columns",
    dtype: Dtype = None,
    columns: list = None,
) -> DataFrame:
    """
    Construct DataFrame from dict of array-like or dicts.
    Creates DataFrame object from dictionary by columns or by index
    allowing dtype specification.

    Parameters
    ----------
    data : dict
        Of the form {field : array-like} or {field : dict}.
    orient : {'columns', 'index', 'tight'}, default 'columns'
        The "orientation" of the data. If the keys of the passed dict
        should be the columns of the resulting DataFrame, pass 'columns'
        (default). Otherwise if the keys should be rows, pass 'index'.
        If 'tight', assume a dict with keys ['index', 'columns', 'data',
        'index_names', 'column_names']. The value is matched
        case-insensitively.
    dtype : dtype, default None
        Data type to force, otherwise infer.
    columns : list, default None
        Column labels to use when ``orient='index'``. Raises a ``ValueError``
        if used with ``orient='columns'`` or ``orient='tight'``.

    Returns
    -------
    DataFrame

    Raises
    ------
    ValueError
        If ``columns`` is passed together with ``orient='columns'`` or
        ``orient='tight'``, or if ``orient`` is not one of the supported
        values.

    See Also
    --------
    DataFrame.from_records : DataFrame from structured ndarray, sequence
        of tuples or dicts, or DataFrame.
    DataFrame : DataFrame object creation using constructor.
    DataFrame.to_dict : Convert the DataFrame to a dictionary.

    Examples
    --------
    By default the keys of the dict become the DataFrame columns:

    >>> import cudf
    >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
    >>> cudf.DataFrame.from_dict(data)
       col_1 col_2
    0      3     a
    1      2     b
    2      1     c
    3      0     d

    Specify ``orient='index'`` to create the DataFrame using dictionary
    keys as rows:

    >>> data = {'row_1': [3, 2, 1, 0], 'row_2': [10, 11, 12, 13]}
    >>> cudf.DataFrame.from_dict(data, orient='index')
            0   1   2   3
    row_1   3   2   1   0
    row_2  10  11  12  13

    When using the 'index' orientation, the column names can be
    specified manually:

    >>> cudf.DataFrame.from_dict(data, orient='index',
    ...                          columns=['A', 'B', 'C', 'D'])
            A   B   C   D
    row_1   3   2   1   0
    row_2  10  11  12  13

    Specify ``orient='tight'`` to create the DataFrame using a 'tight'
    format:

    >>> data = {'index': [('a', 'b'), ('a', 'c')],
    ...         'columns': [('x', 1), ('y', 2)],
    ...         'data': [[1, 3], [2, 4]],
    ...         'index_names': ['n1', 'n2'],
    ...         'column_names': ['z1', 'z2']}
    >>> cudf.DataFrame.from_dict(data, orient='tight')
    z1     x  y
    z2     1  2
    n1 n2
    a  b   1  3
       c   2  4
    """  # noqa: E501

    orient = orient.lower()
    if orient == "index":
        # Only the first value is inspected to pick the construction path.
        if len(data) > 0 and isinstance(
            next(iter(data.values())), (cudf.Series, cupy.ndarray)
        ):
            # cudf.Series / cupy.ndarray values: build the frame with the
            # regular constructor and transpose it, instead of routing the
            # data through pandas.
            result = cls(data).T
            result.columns = columns
            if dtype is not None:
                result = result.astype(dtype)
            return result
        else:
            # Everything else is delegated to pandas and converted back.
            return cls.from_pandas(
                pd.DataFrame.from_dict(
                    data=data,
                    orient=orient,
                    dtype=dtype,
                    columns=columns,
                )
            )
    elif orient == "columns":
        if columns is not None:
            raise ValueError(
                "Cannot use columns parameter with orient='columns'"
            )
        return cls(data, columns=None, dtype=dtype)
    elif orient == "tight":
        if columns is not None:
            raise ValueError(
                "Cannot use columns parameter with orient='tight'"
            )

        # Row labels are built with cudf, column labels with pandas.
        index = _from_dict_create_index(
            data["index"], data["index_names"], cudf
        )
        columns = _from_dict_create_index(
            data["columns"], data["column_names"], pd
        )
        return cls(data["data"], index=index, columns=columns, dtype=dtype)
    else:
        raise ValueError(
            "Expected 'index', 'columns' or 'tight' for orient "
            f"parameter. Got '{orient}' instead"
        )

@_cudf_nvtx_annotate
def to_dict(
    self,
    orient: str = "dict",
    into: type[dict] = dict,
) -> dict | list[dict]:
    """
    Convert the DataFrame to a dictionary.

    The type of the key-value pairs can be customized with the parameters
    (see below).

    Parameters
    ----------
    orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
        Determines the type of the values of the dictionary. The value
        is matched case-insensitively.

        - 'dict' (default) : dict like {column -> {index -> value}}
        - 'list' : dict like {column -> [values]}
        - 'series' : dict like {column -> Series(values)}
        - 'split' : dict like
          {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
        - 'tight' : dict like
          {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
          'index_names' -> [index.names], 'column_names' -> [column.names]}
        - 'records' : list like
          [{column -> value}, ... , {column -> value}]
        - 'index' : dict like {index -> {column -> value}}

        Abbreviations are allowed. `s` indicates `series` and `sp`
        indicates `split`.

    into : class, default dict
        The collections.abc.Mapping subclass used for all Mappings
        in the return value. Can be the actual class or an empty
        instance of the mapping type you want. If you want a
        collections.defaultdict, you must pass it initialized.

    Returns
    -------
    dict, list or collections.abc.Mapping
        Return a collections.abc.Mapping object representing the DataFrame.
        The resulting transformation depends on the `orient` parameter.

    See Also
    --------
    DataFrame.from_dict: Create a DataFrame from a dictionary.
    DataFrame.to_json: Convert a DataFrame to JSON format.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({'col1': [1, 2],
    ...                      'col2': [0.5, 0.75]},
    ...                     index=['row1', 'row2'])
    >>> df
          col1  col2
    row1     1  0.50
    row2     2  0.75
    >>> df.to_dict()
    {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}

    You can specify the return orientation.

    >>> df.to_dict('series')
    {'col1': row1    1
     row2    2
     Name: col1, dtype: int64,
     'col2': row1    0.50
     row2    0.75
     Name: col2, dtype: float64}

    >>> df.to_dict('split')
    {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
     'data': [[1, 0.5], [2, 0.75]]}

    >>> df.to_dict('records')
    [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]

    >>> df.to_dict('index')
    {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}

    >>> df.to_dict('tight')
    {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
     'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}

    You can also specify the mapping type.

    >>> from collections import OrderedDict, defaultdict
    >>> df.to_dict(into=OrderedDict)
    OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
                 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])

    If you want a `defaultdict`, you need to initialize it:

    >>> dd = defaultdict(list)
    >>> df.to_dict('records', into=dd)
    [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
     defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
    """  # noqa: E501
    orient = orient.lower()

    if orient != "series":
        # Every other orientation is produced by pandas from a pandas
        # copy of this frame.
        return self.to_pandas().to_dict(orient=orient, into=into)

    # 'series' is handled here so the values stay cudf.Series objects
    # rather than being converted into pd.Series.
    mapping_factory = pd.core.common.standardize_mapping(into)
    return mapping_factory(
        (label, column) for label, column in self.items()
    )

@_cudf_nvtx_annotate
def scatter_by_map(
self, map_index, map_size=None, keep_index=True, **kwargs
Expand Down Expand Up @@ -7444,3 +7685,11 @@ def _reassign_categories(categories, cols, col_idxs):
offset=cols[name].offset,
size=cols[name].size,
)


def _from_dict_create_index(indexlist, namelist, library):
    """Build the row or column index for a 'tight'-format dictionary.

    Parameters
    ----------
    indexlist : list
        Index labels. When more than one name is given, each label is
        passed to ``MultiIndex.from_tuples`` and so is expected to be a
        tuple.
    namelist : list
        Names of the index levels. More than one name produces a
        ``MultiIndex``; a single name produces a flat ``Index`` with that
        name; an empty list produces an unnamed flat ``Index``.
    library : module
        Module providing ``Index`` and ``MultiIndex`` (the visible callers
        pass ``cudf`` or ``pandas``).

    Returns
    -------
    library.Index or library.MultiIndex
    """
    if len(namelist) > 1:
        return library.MultiIndex.from_tuples(indexlist, names=namelist)

    # Guard against an empty names list: indexing it with [0] would raise
    # an IndexError, so treat it as "no name" instead.
    name = namelist[0] if len(namelist) == 1 else None
    return library.Index(indexlist, name=name)
12 changes: 8 additions & 4 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,15 +321,19 @@ def __len__(self):

@_cudf_nvtx_annotate
def astype(self, dtype, copy=False, **kwargs):
    """Cast the frame's columns and return them as a ``ColumnAccessor``.

    Parameters
    ----------
    dtype : mapping
        Looked up with ``dtype.get(column_name, current_dtype)``, so it
        must be dict-like: column name -> target dtype. Columns that are
        not listed keep their current dtype.
    copy : bool, default False
        Forwarded to each column's ``astype``. For columns whose dtype
        already matches, ``True`` returns ``col.copy()`` and ``False``
        reuses the existing column object.
    **kwargs
        Forwarded unchanged to each column's ``astype``.

    Returns
    -------
    ColumnAccessor
        Holds the resulting columns and carries over this frame's
        ``multiindex`` flag and ``level_names``.
    """
    result_data = {}
    for col_name, col in self._data.items():
        dt = dtype.get(col_name, col.dtype)
        if not is_dtype_equal(dt, col.dtype):
            result_data[col_name] = col.astype(dt, copy=copy, **kwargs)
        else:
            # dtype already matches: skip the cast entirely.
            result_data[col_name] = col.copy() if copy else col

    # NOTE(review): `_create_unsafe` presumably skips validation of
    # `result_data` -- confirm against ColumnAccessor.
    return ColumnAccessor._create_unsafe(
        data=result_data,
        multiindex=self._data.multiindex,
        level_names=self._data.level_names,
    )
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

@_cudf_nvtx_annotate
def equals(self, other):
Expand Down
7 changes: 0 additions & 7 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,13 +270,6 @@ def __init__(self, data=None, index=None):
# to ensure that this constructor is always invoked with an index.
self._index = index

def to_dict(self, *args, **kwargs):
    """Reject conversion to a Python dictionary.

    All positional and keyword arguments are accepted and ignored.

    Raises
    ------
    TypeError
        Always; the message points to ``.to_pandas().to_dict()`` as the
        alternative.
    """
    message = (
        "cuDF does not support conversion to host memory "
        "via `to_dict()` method. Consider using "
        "`.to_pandas().to_dict()` to construct a Python dictionary."
    )
    raise TypeError(message)

@property
def _num_rows(self) -> int:
# Important to use the index because the data may be empty.
Expand Down
39 changes: 39 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,45 @@ def drop(
labels, axis, index, columns, level, inplace, errors
)

@_cudf_nvtx_annotate
def to_dict(self, into: type[dict] = dict) -> dict:
    """
    Convert Series to {label -> value} dict or dict-like object.

    The Series is first converted with ``to_pandas()``; the pandas
    ``to_dict`` then builds the mapping.

    Parameters
    ----------
    into : class, default dict
        The collections.abc.Mapping subclass to use as the return
        object. Can be the actual class or an empty
        instance of the mapping type you want. If you want a
        collections.defaultdict, you must pass it initialized.

    Returns
    -------
    collections.abc.Mapping
        Key-value representation of Series.

    Examples
    --------
    >>> import cudf
    >>> s = cudf.Series([1, 2, 3, 4])
    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64
    >>> s.to_dict()
    {0: 1, 1: 2, 2: 3, 3: 4}
    >>> from collections import OrderedDict, defaultdict
    >>> s.to_dict(OrderedDict)
    OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
    >>> dd = defaultdict(list)
    >>> s.to_dict(dd)
    defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
    """
    host_series = self.to_pandas()
    return host_series.to_dict(into=into)

@_cudf_nvtx_annotate
def append(self, to_append, ignore_index=False, verify_integrity=False):
"""Append values from another ``Series`` or array-like object.
Expand Down
Loading