added proper tests for H2OFrame constructor and discovered a bunch of bugs
sebhrusen committed Nov 21, 2023
1 parent 77bf637 commit 9ba3073
Showing 4 changed files with 219 additions and 107 deletions.
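For context, a hedged sketch of the kind of H2OFrame constructor calls the new tests presumably exercise; the exact test code is not part of this diff, and a running local H2O cluster is assumed.

```python
import h2o

h2o.init()  # assumes a local H2O cluster can be started

# Each of these inputs is normalized by the _handle_* helpers shown in the frame.py diff below.
fr_lists = h2o.H2OFrame([[1, "a"], [2, "b"]])                    # a list of flat lists
fr_dict = h2o.H2OFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})   # a dict of column name -> values
fr_scalar = h2o.H2OFrame(42)                                     # a single scalar becomes a 1x1 frame
```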
89 changes: 78 additions & 11 deletions h2o-py/h2o/frame.py
@@ -9,27 +9,26 @@
import datetime
import functools
from io import StringIO
import itertools
import os
import sys
import re
import tempfile
import traceback
from types import FunctionType
import warnings

import h2o
from h2o.base import Keyed
from h2o.display import H2ODisplay, H2ODisplayWrapper, H2OItemsDisplay, H2OTableDisplay, display, in_ipy, in_zep, repr_def
from h2o.display import H2ODisplay, H2ODisplayWrapper, H2OItemsDisplay, H2OTableDisplay, display, repr_def
from h2o.exceptions import H2OTypeError, H2OValueError, H2ODeprecationWarning
from h2o.expr import ExprNode
from h2o.group_by import GroupBy
from h2o.job import H2OJob
from h2o.plot import get_matplotlib_pyplot, decorate_plot_result, RAISE_ON_FIGURE_ACCESS
from h2o.utils.config import get_config_value
from h2o.utils.metaclass import deprecated_fn
from h2o.utils.shared_utils import (_handle_numpy_array, _handle_pandas_data_frame, _handle_python_dicts,
_handle_python_lists, _is_list, _is_str_list, _py_tmp_key, _quoted,
can_use_pandas, can_use_numpy, quote, normalize_slice, slice_is_normalized,
check_frame_id, can_use_datatable)
from h2o.utils.shared_utils import(gen_header, is_list, is_list_of_lists, is_str_list, py_tmp_key, quoted,
can_use_pandas, can_use_numpy, quote, normalize_slice, slice_is_normalized,
check_frame_id, can_use_datatable)
from h2o.utils.threading import local_context, local_env
from h2o.utils.typechecks import (assert_is_type, assert_satisfies, Enum, I, is_type, numeric, numpy_ndarray,
numpy_datetime, pandas_dataframe, pandas_timestamp, scipy_sparse, U)
@@ -171,7 +170,7 @@ def _upload_sparse_matrix(self, matrix, destination_frame=None):
tmp_handle, tmp_path = tempfile.mkstemp(suffix=".svmlight")
out = os.fdopen(tmp_handle, 'wt', **H2OFrame.__fdopen_kwargs)
if destination_frame is None:
destination_frame = _py_tmp_key(h2o.connection().session_id)
destination_frame = py_tmp_key(h2o.connection().session_id)

# sp.find(matrix) returns (row indices, column indices, values) of the non-zero elements of A. Unfortunately
# there is no guarantee that those elements are returned in the correct order, so need to sort
@@ -501,7 +500,7 @@ def _parse_raw(self, setup):
p.update({k: v for k, v in setup.items() if k in p})

# Extract only 'name' from each src in the array of srcs
p['source_frames'] = [_quoted(src['name']) for src in setup['source_frames']]
p['source_frames'] = [quoted(src['name']) for src in setup['source_frames']]

H2OJob(h2o.api("POST /3/Parse", data=p), "Parse").poll()
# Need to return a Frame here for nearly all callers
@@ -717,7 +716,7 @@ def __pow__(self, rhs):
return _binop(self, "^", rhs)

def __contains__(self, lhs):
return all((t == self).any() for t in lhs) if _is_list(lhs) else (lhs == self).any()
return all((t == self).any() for t in lhs) if is_list(lhs) else (lhs == self).any()

# rops
def __rmod__(self, lhs):
@@ -2175,7 +2174,7 @@ def _compute_ncol_update(self, item):  # computes new ncol, names, and types
new_ncols = -1
if isinstance(item, list):
new_ncols = len(item)
if _is_str_list(item):
if is_str_list(item):
new_types = {k: self.types[k] for k in item}
new_names = item
else:
@@ -5193,3 +5192,71 @@ def generatePandaEnumCols(pandaFtrain, cname, nrows, domainL):
ftemp = temp[newNames]
ctemp = pd.concat([ftemp, zeroFrame], axis=1)
return ctemp


### Module-scope utility functions ###

def _handle_python_lists(python_obj, check_header):
# convert all inputs to lol
if is_list_of_lists(python_obj): # do we have a list of lists: [[...], ..., [...]] ?
ncols = _check_lists_of_lists(python_obj) # must be a list of flat lists, raise ValueError if not
elif isinstance(python_obj, (list, tuple)): # single list
ncols = 1
python_obj = [[e] for e in python_obj]
else: # scalar
python_obj = [[python_obj]]
ncols = 1
# create the header
if check_header == 1:
header = python_obj[0]
python_obj = python_obj[1:]
else:
header = gen_header(ncols)
# shape up the data for csv.DictWriter
# data_to_write = [dict(list(zip(header, row))) for row in python_obj]
return header, python_obj


def _handle_python_dicts(python_obj, check_header):
header = list(python_obj.keys()) if python_obj else gen_header(1)
is_valid = all(re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col) for col in header) # is this a valid header?
if not is_valid:
raise ValueError(
"Did not get a valid set of column names! Must match the regular expression: ^[a-zA-Z_][a-zA-Z0-9_.]*$ ")
for k in python_obj: # check that each value entry is a flat list/tuple or single int, float, or string
v = python_obj[k]
if isinstance(v, (tuple, list)): # if value is a tuple/list, then it must be flat
if is_list_of_lists(v):
raise ValueError("Values in the dictionary must be flattened!")
elif is_type(v, str, numeric):
python_obj[k] = [v]
else:
raise ValueError("Encountered invalid dictionary value when constructing H2OFrame. Got: {0}".format(v))

zipper = getattr(itertools, "zip_longest", None) or getattr(itertools, "izip_longest", None) or zip
rows = list(map(list, zipper(*list(python_obj.values()))))
data_to_write = [dict(list(zip(header, row))) for row in rows]
return header, data_to_write

def _handle_numpy_array(python_obj, header):
return _handle_python_lists(python_obj.tolist(), header)

def _handle_pandas_data_frame(python_obj, header):
data = _handle_python_lists(python_obj.values.tolist(), -1)[1]
return list(str(c) for c in python_obj.columns), data

def _check_lists_of_lists(python_obj):
# check we have a lists of flat lists
# returns longest length of sublist
most_cols = 1
for l in python_obj:
# All items in the list must be a list!
if not isinstance(l, (tuple, list)):
raise ValueError("`python_obj` is a mixture of nested lists and other types.")
most_cols = max(most_cols, len(l))
for ll in l:
# in fact, we must have a list of flat lists!
if isinstance(ll, (tuple, list)):
raise ValueError("`python_obj` is not a list of flat lists!")
return most_cols
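Below is a minimal sketch of how the relocated helpers behave when called directly (roughly what the new constructor tests exercise); it assumes an h2o build that already contains this commit, since these functions are module-private to h2o.frame.

```python
# Sketch only: exercises the private helpers moved into h2o/frame.py by this commit.
from h2o.frame import _handle_python_lists, _handle_python_dicts

# A list of flat lists with check_header != 1: columns are auto-named C1..Cn.
header, rows = _handle_python_lists([[1, 2], [3, 4]], check_header=-1)
print(header, rows)   # ['C1', 'C2'] [[1, 2], [3, 4]]

# A dict of flat lists: keys become the header, values are zipped into per-row dicts.
header, data = _handle_python_dicts({"a": [1, 2], "b": ["x", "y"]}, check_header=0)
print(header, data)   # ['a', 'b'] [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]
```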

4 changes: 3 additions & 1 deletion h2o-py/h2o/h2o.py
@@ -834,7 +834,8 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
if is_type(raw_frames, str): raw_frames = [raw_frames]

# temporary dictionary just to pass the following information to the parser: header, separator
kwargs = {"check_header": header, "source_frames": [quoted(frame_id) for frame_id in raw_frames],
kwargs = {"check_header": header,
"source_frames": [quoted(frame_id) for frame_id in raw_frames],
"single_quotes": quotechar == "'"}
if separator:
kwargs["separator"] = ord(separator)
@@ -844,6 +845,7 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co

if custom_non_data_line_markers is not None:
kwargs["custom_non_data_line_markers"] = custom_non_data_line_markers

if partition_by is not None:
kwargs["partition_by"] = partition_by

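As a hedged illustration of the payload the block above assembles, with made-up frame ids and parser options (none of these values come from the diff):

```python
# Hypothetical inputs, for illustration only.
header, separator, quotechar = 1, "\t", '"'
raw_frames = ["upload_1234_myfile.csv"]   # made-up raw frame id

kwargs = {"check_header": header,
          "source_frames": ['"%s"' % frame_id for frame_id in raw_frames],  # roughly what quoted() produces
          "single_quotes": quotechar == "'"}
if separator:
    kwargs["separator"] = ord(separator)   # the parse endpoint takes the separator's code point (9 for tab)
print(kwargs)
# {'check_header': 1, 'source_frames': ['"upload_1234_myfile.csv"'], 'single_quotes': False, 'separator': 9}
```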
74 changes: 0 additions & 74 deletions h2o-py/h2o/utils/shared_utils.py
@@ -162,43 +162,6 @@ def _gen_header(cols):
return ["C" + str(c) for c in range(1, cols + 1, 1)]


def _check_lists_of_lists(python_obj):
# check we have a lists of flat lists
# returns longest length of sublist
most_cols = 1
for l in python_obj:
# All items in the list must be a list!
if not isinstance(l, (tuple, list)):
raise ValueError("`python_obj` is a mixture of nested lists and other types.")
most_cols = max(most_cols, len(l))
for ll in l:
# in fact, we must have a list of flat lists!
if isinstance(ll, (tuple, list)):
raise ValueError("`python_obj` is not a list of flat lists!")
return most_cols


def _handle_python_lists(python_obj, check_header):
# convert all inputs to lol
if _is_list_of_lists(python_obj): # do we have a list of lists: [[...], ..., [...]] ?
ncols = _check_lists_of_lists(python_obj) # must be a list of flat lists, raise ValueError if not
elif isinstance(python_obj, (list, tuple)): # single list
ncols = 1
python_obj = [[e] for e in python_obj]
else: # scalar
python_obj = [[python_obj]]
ncols = 1
# create the header
if check_header == 1:
header = python_obj[0]
python_obj = python_obj[1:]
else:
header = _gen_header(ncols)
# shape up the data for csv.DictWriter
# data_to_write = [dict(list(zip(header, row))) for row in python_obj]
return header, python_obj


def stringify_dict(d):
return stringify_list(["{'key': %s, 'value': %s}" % (_quoted(k), v) for k, v in d.items()])

@@ -244,38 +207,6 @@ def _is_num_list(l):
def _is_list_of_lists(o):
return any(isinstance(l, (tuple, list)) for l in o)


def _handle_numpy_array(python_obj, header):
return _handle_python_lists(python_obj.tolist(), header)


def _handle_pandas_data_frame(python_obj, header):
data = _handle_python_lists(python_obj.values.tolist(), -1)[1]
return list(str(c) for c in python_obj.columns), data


def _handle_python_dicts(python_obj, check_header):
header = list(python_obj.keys()) if python_obj else _gen_header(1)
is_valid = all(re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col) for col in header) # is this a valid header?
if not is_valid:
raise ValueError(
"Did not get a valid set of column names! Must match the regular expression: ^[a-zA-Z_][a-zA-Z0-9_.]*$ ")
for k in python_obj: # check that each value entry is a flat list/tuple or single int, float, or string
v = python_obj[k]
if isinstance(v, (tuple, list)): # if value is a tuple/list, then it must be flat
if _is_list_of_lists(v):
raise ValueError("Values in the dictionary must be flattened!")
elif is_type(v, str, numeric):
python_obj[k] = [v]
else:
raise ValueError("Encountered invalid dictionary value when constructing H2OFrame. Got: {0}".format(v))

zipper = getattr(itertools, "zip_longest", None) or getattr(itertools, "izip_longest", None) or zip
rows = list(map(list, zipper(*list(python_obj.values()))))
data_to_write = [dict(list(zip(header, row))) for row in rows]
return header, data_to_write


def _is_fr(o):
return o.__class__.__name__ == "H2OFrame" # hack to avoid circular imports

@@ -426,14 +357,9 @@ def slice_is_normalized(s):
quoted = _quoted
is_list = _is_list
is_fr = _is_fr
handle_python_dicts = _handle_python_dicts
handle_pandas_data_frame = _handle_pandas_data_frame
handle_numpy_array = _handle_numpy_array
is_list_of_lists = _is_list_of_lists
is_num_list = _is_num_list
is_str_list = _is_str_list
handle_python_lists = _handle_python_lists
check_lists_of_lists = _check_lists_of_lists

gen_model_file_name = "h2o-genmodel.jar"
h2o_predictor_class = "hex.genmodel.tools.PredictCsv"
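A short sketch of the import split after this commit, assuming an h2o build that contains it: the frame-construction helpers now live in h2o.frame, while the generic list utilities keep their public aliases in shared_utils.

```python
# Generic helpers still exported by h2o.utils.shared_utils (public aliases kept above).
from h2o.utils.shared_utils import gen_header, is_list_of_lists, is_str_list

# Frame-construction helpers were moved into h2o.frame by this commit.
from h2o.frame import _check_lists_of_lists

print(gen_header(3))                                           # ['C1', 'C2', 'C3']
print(is_list_of_lists([[1], [2]]), is_str_list(["a", "b"]))   # True True
print(_check_lists_of_lists([[1, 2, 3], [4]]))                 # 3 (longest sublist length)
```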