From 9ba30730130cfe7e77e9294e12db091864cbed54 Mon Sep 17 00:00:00 2001
From: sebhrusen
Date: Tue, 21 Nov 2023 22:57:45 +0100
Subject: [PATCH] =?UTF-8?q?added=20proper=20tests=20for=20H2OFrame=20const?=
 =?UTF-8?q?ructor=20and=20discovered=20a=20bunch=20of=20bugs=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 h2o-py/h2o/frame.py                                |  89 ++++++++--
 h2o-py/h2o/h2o.py                                  |   4 +-
 h2o-py/h2o/utils/shared_utils.py                   |  74 --------
 .../Data_Manipulation/pyunit_h2oH2OFrame.py        | 159 +++++++++++++++---
 4 files changed, 219 insertions(+), 107 deletions(-)

diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py
index 8be019aa35ba..5f784bc3a140 100644
--- a/h2o-py/h2o/frame.py
+++ b/h2o-py/h2o/frame.py
@@ -9,16 +9,16 @@
 import datetime
 import functools
 from io import StringIO
+import itertools
 import os
-import sys
+import re
 import tempfile
-import traceback
 from types import FunctionType
 import warnings
 
 import h2o
 from h2o.base import Keyed
-from h2o.display import H2ODisplay, H2ODisplayWrapper, H2OItemsDisplay, H2OTableDisplay, display, in_ipy, in_zep, repr_def
+from h2o.display import H2ODisplay, H2ODisplayWrapper, H2OItemsDisplay, H2OTableDisplay, display, repr_def
 from h2o.exceptions import H2OTypeError, H2OValueError, H2ODeprecationWarning
 from h2o.expr import ExprNode
 from h2o.group_by import GroupBy
@@ -26,10 +26,9 @@
 from h2o.plot import get_matplotlib_pyplot, decorate_plot_result, RAISE_ON_FIGURE_ACCESS
 from h2o.utils.config import get_config_value
 from h2o.utils.metaclass import deprecated_fn
-from h2o.utils.shared_utils import (_handle_numpy_array, _handle_pandas_data_frame, _handle_python_dicts,
-                                    _handle_python_lists, _is_list, _is_str_list, _py_tmp_key, _quoted,
-                                    can_use_pandas, can_use_numpy, quote, normalize_slice, slice_is_normalized,
-                                    check_frame_id, can_use_datatable)
+from h2o.utils.shared_utils import (gen_header, is_list, is_list_of_lists, is_str_list, py_tmp_key, quoted,
+                                    can_use_pandas, can_use_numpy, quote, normalize_slice, slice_is_normalized,
+                                    check_frame_id, can_use_datatable)
 from h2o.utils.threading import local_context, local_env
 from h2o.utils.typechecks import (assert_is_type, assert_satisfies, Enum, I, is_type,
                                   numeric, numpy_ndarray, numpy_datetime, pandas_dataframe, pandas_timestamp, scipy_sparse, U)
@@ -171,7 +170,7 @@ def _upload_sparse_matrix(self, matrix, destination_frame=None):
         tmp_handle, tmp_path = tempfile.mkstemp(suffix=".svmlight")
         out = os.fdopen(tmp_handle, 'wt', **H2OFrame.__fdopen_kwargs)
         if destination_frame is None:
-            destination_frame = _py_tmp_key(h2o.connection().session_id)
+            destination_frame = py_tmp_key(h2o.connection().session_id)
 
         # sp.find(matrix) returns (row indices, column indices, values) of the non-zero elements of A. Unfortunately
         # there is no guarantee that those elements are returned in the correct order, so need to sort
@@ -501,7 +500,7 @@ def _parse_raw(self, setup):
         p.update({k: v for k, v in setup.items() if k in p})
 
         # Extract only 'name' from each src in the array of srcs
-        p['source_frames'] = [_quoted(src['name']) for src in setup['source_frames']]
+        p['source_frames'] = [quoted(src['name']) for src in setup['source_frames']]
 
         H2OJob(h2o.api("POST /3/Parse", data=p), "Parse").poll()
         # Need to return a Frame here for nearly all callers
@@ -717,7 +716,7 @@ def __pow__(self, rhs):
         return _binop(self, "^", rhs)
 
     def __contains__(self, lhs):
-        return all((t == self).any() for t in lhs) if _is_list(lhs) else (lhs == self).any()
+        return all((t == self).any() for t in lhs) if is_list(lhs) else (lhs == self).any()
 
     # rops
     def __rmod__(self, lhs):
@@ -2175,7 +2174,7 @@ def _compute_ncol_update(self, item):  # computes new ncol, names, and types
         new_ncols = -1
         if isinstance(item, list):
             new_ncols = len(item)
-            if _is_str_list(item):
+            if is_str_list(item):
                 new_types = {k: self.types[k] for k in item}
                 new_names = item
             else:
@@ -5193,3 +5192,71 @@ def generatePandaEnumCols(pandaFtrain, cname, nrows, domainL):
     ftemp = temp[newNames]
     ctemp = pd.concat([ftemp, zeroFrame], axis=1)
     return ctemp
+
+
+### Module-scope utility functions ###
+
+def _handle_python_lists(python_obj, check_header):
+    # convert all inputs to lol
+    if is_list_of_lists(python_obj):  # do we have a list of lists: [[...], ..., [...]] ?
+        ncols = _check_lists_of_lists(python_obj)  # must be a list of flat lists, raise ValueError if not
+    elif isinstance(python_obj, (list, tuple)):  # single list
+        ncols = 1
+        python_obj = [[e] for e in python_obj]
+    else:  # scalar
+        python_obj = [[python_obj]]
+        ncols = 1
+    # create the header
+    if check_header == 1:
+        header = python_obj[0]
+        python_obj = python_obj[1:]
+    else:
+        header = gen_header(ncols)
+    # shape up the data for csv.DictWriter
+    # data_to_write = [dict(list(zip(header, row))) for row in python_obj]
+    return header, python_obj
+
+
+def _handle_python_dicts(python_obj, check_header):
+    header = list(python_obj.keys()) if python_obj else gen_header(1)
+    is_valid = all(re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col) for col in header)  # is this a valid header?
+    if not is_valid:
+        raise ValueError(
+            "Did not get a valid set of column names! Must match the regular expression: ^[a-zA-Z_][a-zA-Z0-9_.]*$ ")
+    for k in python_obj:  # check that each value entry is a flat list/tuple or single int, float, or string
+        v = python_obj[k]
+        if isinstance(v, (tuple, list)):  # if value is a tuple/list, then it must be flat
+            if is_list_of_lists(v):
+                raise ValueError("Values in the dictionary must be flattened!")
+        elif is_type(v, str, numeric):
+            python_obj[k] = [v]
+        else:
+            raise ValueError("Encountered invalid dictionary value when constructing H2OFrame. Got: {0}".format(v))
+
+    zipper = getattr(itertools, "zip_longest", None) or getattr(itertools, "izip_longest", None) or zip
+    rows = list(map(list, zipper(*list(python_obj.values()))))
+    data_to_write = [dict(list(zip(header, row))) for row in rows]
+    return header, data_to_write
+
+def _handle_numpy_array(python_obj, header):
+    return _handle_python_lists(python_obj.tolist(), header)
+
+def _handle_pandas_data_frame(python_obj, header):
+    data = _handle_python_lists(python_obj.values.tolist(), -1)[1]
+    return list(str(c) for c in python_obj.columns), data
+
+def _check_lists_of_lists(python_obj):
+    # check we have a lists of flat lists
+    # returns longest length of sublist
+    most_cols = 1
+    for l in python_obj:
+        # All items in the list must be a list!
+        if not isinstance(l, (tuple, list)):
+            raise ValueError("`python_obj` is a mixture of nested lists and other types.")
+        most_cols = max(most_cols, len(l))
+        for ll in l:
+            # in fact, we must have a list of flat lists!
+            if isinstance(ll, (tuple, list)):
+                raise ValueError("`python_obj` is not a list of flat lists!")
+    return most_cols
+
diff --git a/h2o-py/h2o/h2o.py b/h2o-py/h2o/h2o.py
index 8b28fa462f61..d949193cd3cd 100644
--- a/h2o-py/h2o/h2o.py
+++ b/h2o-py/h2o/h2o.py
@@ -834,7 +834,8 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
     if is_type(raw_frames, str): raw_frames = [raw_frames]
 
     # temporary dictionary just to pass the following information to the parser: header, separator
-    kwargs = {"check_header": header, "source_frames": [quoted(frame_id) for frame_id in raw_frames],
+    kwargs = {"check_header": header,
+              "source_frames": [quoted(frame_id) for frame_id in raw_frames],
               "single_quotes": quotechar == "'"}
     if separator:
         kwargs["separator"] = ord(separator)
@@ -844,6 +845,7 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
 
     if custom_non_data_line_markers is not None:
         kwargs["custom_non_data_line_markers"] = custom_non_data_line_markers
+
     if partition_by is not None:
         kwargs["partition_by"] = partition_by
 
diff --git a/h2o-py/h2o/utils/shared_utils.py b/h2o-py/h2o/utils/shared_utils.py
index fc2366e4b35d..3a649e6afe2c 100644
--- a/h2o-py/h2o/utils/shared_utils.py
+++ b/h2o-py/h2o/utils/shared_utils.py
@@ -162,43 +162,6 @@ def _gen_header(cols):
     return ["C" + str(c) for c in range(1, cols + 1, 1)]
 
 
-def _check_lists_of_lists(python_obj):
-    # check we have a lists of flat lists
-    # returns longest length of sublist
-    most_cols = 1
-    for l in python_obj:
-        # All items in the list must be a list!
-        if not isinstance(l, (tuple, list)):
-            raise ValueError("`python_obj` is a mixture of nested lists and other types.")
-        most_cols = max(most_cols, len(l))
-        for ll in l:
-            # in fact, we must have a list of flat lists!
-            if isinstance(ll, (tuple, list)):
-                raise ValueError("`python_obj` is not a list of flat lists!")
-    return most_cols
-
-
-def _handle_python_lists(python_obj, check_header):
-    # convert all inputs to lol
-    if _is_list_of_lists(python_obj):  # do we have a list of lists: [[...], ..., [...]] ?
-        ncols = _check_lists_of_lists(python_obj)  # must be a list of flat lists, raise ValueError if not
-    elif isinstance(python_obj, (list, tuple)):  # single list
-        ncols = 1
-        python_obj = [[e] for e in python_obj]
-    else:  # scalar
-        python_obj = [[python_obj]]
-        ncols = 1
-    # create the header
-    if check_header == 1:
-        header = python_obj[0]
-        python_obj = python_obj[1:]
-    else:
-        header = _gen_header(ncols)
-    # shape up the data for csv.DictWriter
-    # data_to_write = [dict(list(zip(header, row))) for row in python_obj]
-    return header, python_obj
-
-
 def stringify_dict(d):
     return stringify_list(["{'key': %s, 'value': %s}" % (_quoted(k), v) for k, v in d.items()])
 
@@ -244,38 +207,6 @@ def _is_num_list(l):
 def _is_list_of_lists(o):
     return any(isinstance(l, (tuple, list)) for l in o)
 
-
-def _handle_numpy_array(python_obj, header):
-    return _handle_python_lists(python_obj.tolist(), header)
-
-
-def _handle_pandas_data_frame(python_obj, header):
-    data = _handle_python_lists(python_obj.values.tolist(), -1)[1]
-    return list(str(c) for c in python_obj.columns), data
-
-
-def _handle_python_dicts(python_obj, check_header):
-    header = list(python_obj.keys()) if python_obj else _gen_header(1)
-    is_valid = all(re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col) for col in header)  # is this a valid header?
-    if not is_valid:
-        raise ValueError(
-            "Did not get a valid set of column names! Must match the regular expression: ^[a-zA-Z_][a-zA-Z0-9_.]*$ ")
-    for k in python_obj:  # check that each value entry is a flat list/tuple or single int, float, or string
-        v = python_obj[k]
-        if isinstance(v, (tuple, list)):  # if value is a tuple/list, then it must be flat
-            if _is_list_of_lists(v):
-                raise ValueError("Values in the dictionary must be flattened!")
-        elif is_type(v, str, numeric):
-            python_obj[k] = [v]
-        else:
-            raise ValueError("Encountered invalid dictionary value when constructing H2OFrame. Got: {0}".format(v))
-
-    zipper = getattr(itertools, "zip_longest", None) or getattr(itertools, "izip_longest", None) or zip
-    rows = list(map(list, zipper(*list(python_obj.values()))))
-    data_to_write = [dict(list(zip(header, row))) for row in rows]
-    return header, data_to_write
-
-
 def _is_fr(o):
     return o.__class__.__name__ == "H2OFrame"  # hack to avoid circular imports
 
@@ -426,14 +357,9 @@ def slice_is_normalized(s):
 quoted = _quoted
 is_list = _is_list
 is_fr = _is_fr
-handle_python_dicts = _handle_python_dicts
-handle_pandas_data_frame = _handle_pandas_data_frame
-handle_numpy_array = _handle_numpy_array
 is_list_of_lists = _is_list_of_lists
 is_num_list = _is_num_list
 is_str_list = _is_str_list
-handle_python_lists = _handle_python_lists
-check_lists_of_lists = _check_lists_of_lists
 
 gen_model_file_name = "h2o-genmodel.jar"
 h2o_predictor_class = "hex.genmodel.tools.PredictCsv"
diff --git a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
index 5d082c6b767c..2fe59bb59b77 100644
--- a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
+++ b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
@@ -1,28 +1,145 @@
 import sys
 sys.path.insert(1,"../../../")
-from tests import pyunit_utils
+from tests import pyunit_utils as pu
 import h2o
 from h2o.utils.typechecks import assert_is_type
 from h2o.frame import H2OFrame
 
-def h2o_H2OFrame():
-    """
-    Python API test: h2o.frame.H2OFrame(python_obj=None, destination_frame=None, header=0, separator=', ',
-    column_names=None, column_types=None, na_strings=None)
-    """
-    python_lists = [[1,4,"a",1], [2,5,"b",0], [3,6,"c",1]]
-    col_names=["num1","num2","str1","enum1"]
-    dest_frame="newFrame"
-    heads=-1
-    sep=','
-    col_types=['numeric', 'numeric', 'string', 'enum']
-    na_str=['NA']
-    h2oframe = h2o.H2OFrame(python_obj=python_lists, destination_frame=dest_frame, header=heads, separator=sep,
-                            column_names=col_names, column_types=col_types, na_strings=na_str)
-    assert_is_type(h2oframe, H2OFrame)
-    assert h2oframe.nrows==len(python_lists) and h2oframe.ncols==len(python_lists[0]), "h2o.H2OFrame() command is " \
-                                                                                       "not working."
-
-
-pyunit_utils.standalone_test(h2o_H2OFrame)
+
+data = [
+    [1, .4, "a", 1],
+    [2, 5 , "b", 0],
+    [3, 6 , "", 1],
+]
+col_names = ["num1", "num2", "str1", "enum1"]
+col_types = ['numeric', 'numeric', 'string', 'enum']
+col_types_back = dict(zip(col_names, ['int', 'real', 'string', 'enum']))
+na_str = ['']
+
+
+def check_frame(fr, dest=None):
+    if dest is not None:
+        assert fr.key == dest
+    assert fr.nrows == len(data)
+    assert fr.ncols == len(data[0])
+    assert fr.columns == col_names
+    assert fr.types == col_types_back
+    for i, row in enumerate(data):
+        for j, value in enumerate(row):
+            expected_val = ('NA' if data[i][j] == na_str[0]
+                            else str(data[i][j]) if col_types[j] == 'enum'
+                            else data[i][j])
+            assert fr[i, j] == expected_val, "expected value at [%s, %s] = %s, but got %s" % (i, j, expected_val, fr[i, j])
+
+
+def H2OFrame_from_list_no_header():
+    fr = h2o.H2OFrame(data, destination_frame="from_list_no_header",
+                      column_names=col_names, column_types=col_types, na_strings=na_str)
+    assert_is_type(fr, H2OFrame)
+    check_frame(fr, "from_list_no_header")
+
+
+def H2OFrame_from_list_with_header():
+    fr = h2o.H2OFrame([col_names]+data, destination_frame="from_list_with_header",
+                      header=1, column_types=col_types, na_strings=na_str)
+    assert_is_type(fr, H2OFrame)
+    check_frame(fr, "from_list_with_header")
+
+
+def H2OFrame_from_dict():
+    fr = h2o.H2OFrame(dict(zip(col_names, zip(*data))), destination_frame="from_dict",
+                      column_types=col_types, na_strings=na_str)
+    assert_is_type(fr, H2OFrame)
+    check_frame(fr, "from_dict")
+
+
+def H2OFrame_from_numpy():
+    import numpy as np
+    fr = h2o.H2OFrame(np.array(data), destination_frame="from_numpy_array",
+                      column_names=col_names, column_types=col_types, na_strings=na_str)
+    assert_is_type(fr, H2OFrame)
+    check_frame(fr, "from_numpy_array")
+
+
+def H2OFrame_from_pandas():
+    import pandas as pd
+    fr = h2o.H2OFrame(pd.DataFrame(data, columns=col_names), destination_frame="from_pandas_dataframe",
+                      column_types=col_types, na_strings=na_str)
+    assert_is_type(fr, H2OFrame)
+    check_frame(fr, "from_pandas_dataframe")
+
+
+def H2OFrame_from_scipy():
+    from scipy.sparse import csr_matrix, csc_matrix
+    nostr_data = [[0 if isinstance(v, str) else v for v in r] for r in data]
+
+    def check_sparse_frame(fr, dest):
+        assert fr.key == dest
+        assert fr.nrows == len(nostr_data)
+        assert fr.ncols == len(nostr_data[0])
+        assert fr.columns == ["C%i" % i for i in range(1, fr.ncols+1)]  # BUG! construction with scipy matrix ignores `column_names` param: see ticket GH-#####
+        assert fr.types == dict(C1='int', C2='real', C3='int', C4='int')  # enum types are also ignored, but it makes sense for sparse matrices; we could add a warning though.
+        for i, row in enumerate(data):
+            for j, value in enumerate(row):
+                assert fr[i, j] == nostr_data[i][j], "expected value at [%s, %s] = %s, but got %s" % (i, j, nostr_data[i][j], fr[i, j])
+
+
+    fr = h2o.H2OFrame(csr_matrix(nostr_data), destination_frame="from_sparse_row_matrix",
+                      column_names=col_names, column_types=col_types)
+    assert_is_type(fr, H2OFrame)
+    print(fr)
+    check_sparse_frame(fr, "from_sparse_row_matrix")
+
+    fr = h2o.H2OFrame(csc_matrix(nostr_data), destination_frame="from_sparse_column_matrix",
+                      column_names=col_names, column_types=col_types)
+    assert_is_type(fr, H2OFrame)
+    check_sparse_frame(fr, "from_sparse_column_matrix")
+
+
+def H2OFrame_from_H2OFrame():
+    fr = h2o.H2OFrame(python_obj=data, destination_frame="h2oframe_ori",
+                      column_names=col_names, column_types=col_types, na_strings=na_str)
+
+    dupl1 = h2o.H2OFrame(python_obj=fr)
+    assert_is_type(dupl1, H2OFrame)
+    check_frame(dupl1)
+    assert fr.key != dupl1.key
+
+    dupl2 = h2o.H2OFrame(python_obj=fr, destination_frame="h2oframe_duplicate")
+    assert_is_type(dupl2, H2OFrame)
+    check_frame(dupl2, "h2oframe_duplicate")
+
+    dupl3 = h2o.H2OFrame(python_obj=fr, column_names=["n1", "n2", "s1", "e1"], destination_frame="h2oframe_duplicate_with_renamed_columns")
+    assert_is_type(dupl3, H2OFrame)
+    assert dupl3.key == "h2oframe_duplicate_with_renamed_columns"
+    assert dupl3.ncols == fr.ncols
+    assert dupl3.nrows == fr.nrows
+    assert dupl3.as_data_frame(use_pandas=False, header=False) == fr.as_data_frame(use_pandas=False, header=False)
+    assert dupl3.columns == ["n1", "n2", "s1", "e1"]
+
+    dupl4 = h2o.H2OFrame(python_obj=fr, skipped_columns=[1, 3], column_names=["n1", "s1"], destination_frame="h2oframe_duplicate_with_skipped_columns")
+    assert_is_type(dupl4, H2OFrame)
+    assert dupl4.key == "h2oframe_duplicate_with_skipped_columns"
+    assert dupl4.ncols == fr.ncols - 2
+    assert dupl4.nrows == fr.nrows
+    assert dupl4.as_data_frame(use_pandas=False, header=False) == fr.drop([1, 3]).as_data_frame(use_pandas=False, header=False)
+    assert dupl4.columns == ["n1", "s1"]
+
+
+def H2OFrame_skipped_columns_is_BUGGY():
+    try:
+        h2o.H2OFrame(data, skipped_columns=[1])
+        assert False, "skipped_columns handling may be fixed now"  # parse_setup is odd: only half of the parameters are passed to build the ParseSetup, and the rest of the logic is done locally, which is why it's buggy: see issue GH-#####
+    except ValueError as e:
+        assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e)
+
+
+pu.run_tests([
+    H2OFrame_from_list_no_header,
+    H2OFrame_from_list_with_header,
+    H2OFrame_from_dict,
+    H2OFrame_from_numpy,
+    H2OFrame_from_pandas,
+    H2OFrame_from_scipy,
+    H2OFrame_from_H2OFrame,
+    H2OFrame_skipped_columns_is_BUGGY
+])
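
For context on the dict path exercised by H2OFrame_from_dict above: the module-scope _handle_python_dicts helper added to frame.py validates the column names, wraps scalar values into one-element lists, and then transposes the column dictionary into rows with itertools.zip_longest before the data is shaped for csv.DictWriter. A minimal standalone sketch of that transposition step (dict_to_header_and_rows is a hypothetical name used for illustration only, not part of this patch):

import itertools

def dict_to_header_and_rows(columns):
    # columns maps column name -> list of cell values, e.g. {"num1": [1, 2, 3], "str1": ["a", "b"]}
    header = list(columns.keys())
    # zip_longest pads shorter columns with None, so ragged inputs turn into missing cells
    rows = [list(r) for r in itertools.zip_longest(*columns.values())]
    return header, rows

header, rows = dict_to_header_and_rows({"num1": [1, 2, 3], "str1": ["a", "b"]})
print(header)  # ['num1', 'str1']
print(rows)    # [[1, 'a'], [2, 'b'], [3, None]]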