added proper tests for H2OFrame constructor and discovered a bunch of bugs
sebhrusen committed Nov 21, 2023
1 parent 77bf637 commit 9ba3073
Showing 4 changed files with 219 additions and 107 deletions.
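For context, a hedged sketch of the kind of H2OFrame constructor calls the new tests presumably exercise; the exact test code is not part of this diff, and a running local H2O cluster is assumed.

```python
import h2o

h2o.init()  # assumes a local H2O cluster can be started

# Each of these inputs is normalized by the _handle_* helpers shown in the frame.py diff below.
fr_lists = h2o.H2OFrame([[1, "a"], [2, "b"]])                    # a list of flat lists
fr_dict = h2o.H2OFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})   # a dict of column name -> values
fr_scalar = h2o.H2OFrame(42)                                     # a single scalar becomes a 1x1 frame
```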
89 changes: 78 additions & 11 deletions h2o-py/h2o/frame.py
@@ -9,27 +9,26 @@
import datetime
import functools
from io import StringIO
import itertools
import os
import sys
import re
import tempfile
import traceback
from types import FunctionType
import warnings

import h2o
from h2o.base import Keyed
from h2o.display import H2ODisplay, H2ODisplayWrapper, H2OItemsDisplay, H2OTableDisplay, display, in_ipy, in_zep, repr_def
from h2o.display import H2ODisplay, H2ODisplayWrapper, H2OItemsDisplay, H2OTableDisplay, display, repr_def
from h2o.exceptions import H2OTypeError, H2OValueError, H2ODeprecationWarning
from h2o.expr import ExprNode
from h2o.group_by import GroupBy
from h2o.job import H2OJob
from h2o.plot import get_matplotlib_pyplot, decorate_plot_result, RAISE_ON_FIGURE_ACCESS
from h2o.utils.config import get_config_value
from h2o.utils.metaclass import deprecated_fn
from h2o.utils.shared_utils import (_handle_numpy_array, _handle_pandas_data_frame, _handle_python_dicts,
_handle_python_lists, _is_list, _is_str_list, _py_tmp_key, _quoted,
can_use_pandas, can_use_numpy, quote, normalize_slice, slice_is_normalized,
check_frame_id, can_use_datatable)
from h2o.utils.shared_utils import(gen_header, is_list, is_list_of_lists, is_str_list, py_tmp_key, quoted,
can_use_pandas, can_use_numpy, quote, normalize_slice, slice_is_normalized,
check_frame_id, can_use_datatable)
from h2o.utils.threading import local_context, local_env
from h2o.utils.typechecks import (assert_is_type, assert_satisfies, Enum, I, is_type, numeric, numpy_ndarray,
numpy_datetime, pandas_dataframe, pandas_timestamp, scipy_sparse, U)
@@ -171,7 +170,7 @@ def _upload_sparse_matrix(self, matrix, destination_frame=None):
tmp_handle, tmp_path = tempfile.mkstemp(suffix=".svmlight")
out = os.fdopen(tmp_handle, 'wt', **H2OFrame.__fdopen_kwargs)
if destination_frame is None:
destination_frame = _py_tmp_key(h2o.connection().session_id)
destination_frame = py_tmp_key(h2o.connection().session_id)

# sp.find(matrix) returns (row indices, column indices, values) of the non-zero elements of A. Unfortunately
# there is no guarantee that those elements are returned in the correct order, so need to sort
@@ -501,7 +500,7 @@ def _parse_raw(self, setup):
p.update({k: v for k, v in setup.items() if k in p})

# Extract only 'name' from each src in the array of srcs
p['source_frames'] = [_quoted(src['name']) for src in setup['source_frames']]
p['source_frames'] = [quoted(src['name']) for src in setup['source_frames']]

H2OJob(h2o.api("POST /3/Parse", data=p), "Parse").poll()
# Need to return a Frame here for nearly all callers
@@ -717,7 +716,7 @@ def __pow__(self, rhs):
return _binop(self, "^", rhs)

def __contains__(self, lhs):
return all((t == self).any() for t in lhs) if _is_list(lhs) else (lhs == self).any()
return all((t == self).any() for t in lhs) if is_list(lhs) else (lhs == self).any()

# rops
def __rmod__(self, lhs):
@@ -2175,7 +2174,7 @@ def _compute_ncol_update(self, item):  # computes new ncol, names, and types
new_ncols = -1
if isinstance(item, list):
new_ncols = len(item)
if _is_str_list(item):
if is_str_list(item):
new_types = {k: self.types[k] for k in item}
new_names = item
else:
@@ -5193,3 +5192,71 @@ def generatePandaEnumCols(pandaFtrain, cname, nrows, domainL):
ftemp = temp[newNames]
ctemp = pd.concat([ftemp, zeroFrame], axis=1)
return ctemp


### Module-scope utility functions ###

def _handle_python_lists(python_obj, check_header):
# convert all inputs to lol
if is_list_of_lists(python_obj): # do we have a list of lists: [[...], ..., [...]] ?
ncols = _check_lists_of_lists(python_obj) # must be a list of flat lists, raise ValueError if not
elif isinstance(python_obj, (list, tuple)): # single list
ncols = 1
python_obj = [[e] for e in python_obj]
else: # scalar
python_obj = [[python_obj]]
ncols = 1
# create the header
if check_header == 1:
header = python_obj[0]
python_obj = python_obj[1:]
else:
header = gen_header(ncols)
# shape up the data for csv.DictWriter
# data_to_write = [dict(list(zip(header, row))) for row in python_obj]
return header, python_obj


def _handle_python_dicts(python_obj, check_header):
header = list(python_obj.keys()) if python_obj else gen_header(1)
is_valid = all(re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col) for col in header) # is this a valid header?
if not is_valid:
raise ValueError(
"Did not get a valid set of column names! Must match the regular expression: ^[a-zA-Z_][a-zA-Z0-9_.]*$ ")
for k in python_obj: # check that each value entry is a flat list/tuple or single int, float, or string
v = python_obj[k]
if isinstance(v, (tuple, list)): # if value is a tuple/list, then it must be flat
if is_list_of_lists(v):
raise ValueError("Values in the dictionary must be flattened!")
elif is_type(v, str, numeric):
python_obj[k] = [v]
else:
raise ValueError("Encountered invalid dictionary value when constructing H2OFrame. Got: {0}".format(v))

zipper = getattr(itertools, "zip_longest", None) or getattr(itertools, "izip_longest", None) or zip
rows = list(map(list, zipper(*list(python_obj.values()))))
data_to_write = [dict(list(zip(header, row))) for row in rows]
return header, data_to_write

def _handle_numpy_array(python_obj, header):
return _handle_python_lists(python_obj.tolist(), header)

def _handle_pandas_data_frame(python_obj, header):
data = _handle_python_lists(python_obj.values.tolist(), -1)[1]
return list(str(c) for c in python_obj.columns), data

def _check_lists_of_lists(python_obj):
# check we have a lists of flat lists
# returns longest length of sublist
most_cols = 1
for l in python_obj:
# All items in the list must be a list!
if not isinstance(l, (tuple, list)):
raise ValueError("`python_obj` is a mixture of nested lists and other types.")
most_cols = max(most_cols, len(l))
for ll in l:
# in fact, we must have a list of flat lists!
if isinstance(ll, (tuple, list)):
raise ValueError("`python_obj` is not a list of flat lists!")
return most_cols
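Below is a minimal sketch of how the relocated helpers behave when called directly (roughly what the new constructor tests exercise); it assumes an h2o build that already contains this commit, since these functions are module-private to h2o.frame.

```python
# Sketch only: exercises the private helpers moved into h2o/frame.py by this commit.
from h2o.frame import _handle_python_lists, _handle_python_dicts

# A list of flat lists with check_header != 1: columns are auto-named C1..Cn.
header, rows = _handle_python_lists([[1, 2], [3, 4]], check_header=-1)
print(header, rows)   # ['C1', 'C2'] [[1, 2], [3, 4]]

# A dict of flat lists: keys become the header, values are zipped into per-row dicts.
header, data = _handle_python_dicts({"a": [1, 2], "b": ["x", "y"]}, check_header=0)
print(header, data)   # ['a', 'b'] [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]
```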

4 changes: 3 additions & 1 deletion h2o-py/h2o/h2o.py
@@ -834,7 +834,8 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
if is_type(raw_frames, str): raw_frames = [raw_frames]

# temporary dictionary just to pass the following information to the parser: header, separator
kwargs = {"check_header": header, "source_frames": [quoted(frame_id) for frame_id in raw_frames],
kwargs = {"check_header": header,
"source_frames": [quoted(frame_id) for frame_id in raw_frames],
"single_quotes": quotechar == "'"}
if separator:
kwargs["separator"] = ord(separator)
@@ -844,6 +845,7 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co

if custom_non_data_line_markers is not None:
kwargs["custom_non_data_line_markers"] = custom_non_data_line_markers

if partition_by is not None:
kwargs["partition_by"] = partition_by

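As a hedged illustration of the payload the block above assembles, with made-up frame ids and parser options (none of these values come from the diff):

```python
# Hypothetical inputs, for illustration only.
header, separator, quotechar = 1, "\t", '"'
raw_frames = ["upload_1234_myfile.csv"]   # made-up raw frame id

kwargs = {"check_header": header,
          "source_frames": ['"%s"' % frame_id for frame_id in raw_frames],  # roughly what quoted() produces
          "single_quotes": quotechar == "'"}
if separator:
    kwargs["separator"] = ord(separator)   # the parse endpoint takes the separator's code point (9 for tab)
print(kwargs)
# {'check_header': 1, 'source_frames': ['"upload_1234_myfile.csv"'], 'single_quotes': False, 'separator': 9}
```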
74 changes: 0 additions & 74 deletions h2o-py/h2o/utils/shared_utils.py
@@ -162,43 +162,6 @@ def _gen_header(cols):
return ["C" + str(c) for c in range(1, cols + 1, 1)]


def _check_lists_of_lists(python_obj):
# check we have a lists of flat lists
# returns longest length of sublist
most_cols = 1
for l in python_obj:
# All items in the list must be a list!
if not isinstance(l, (tuple, list)):
raise ValueError("`python_obj` is a mixture of nested lists and other types.")
most_cols = max(most_cols, len(l))
for ll in l:
# in fact, we must have a list of flat lists!
if isinstance(ll, (tuple, list)):
raise ValueError("`python_obj` is not a list of flat lists!")
return most_cols


def _handle_python_lists(python_obj, check_header):
# convert all inputs to lol
if _is_list_of_lists(python_obj): # do we have a list of lists: [[...], ..., [...]] ?
ncols = _check_lists_of_lists(python_obj) # must be a list of flat lists, raise ValueError if not
elif isinstance(python_obj, (list, tuple)): # single list
ncols = 1
python_obj = [[e] for e in python_obj]
else: # scalar
python_obj = [[python_obj]]
ncols = 1
# create the header
if check_header == 1:
header = python_obj[0]
python_obj = python_obj[1:]
else:
header = _gen_header(ncols)
# shape up the data for csv.DictWriter
# data_to_write = [dict(list(zip(header, row))) for row in python_obj]
return header, python_obj


def stringify_dict(d):
return stringify_list(["{'key': %s, 'value': %s}" % (_quoted(k), v) for k, v in d.items()])

@@ -244,38 +207,6 @@ def _is_num_list(l):
def _is_list_of_lists(o):
return any(isinstance(l, (tuple, list)) for l in o)


def _handle_numpy_array(python_obj, header):
return _handle_python_lists(python_obj.tolist(), header)


def _handle_pandas_data_frame(python_obj, header):
data = _handle_python_lists(python_obj.values.tolist(), -1)[1]
return list(str(c) for c in python_obj.columns), data


def _handle_python_dicts(python_obj, check_header):
header = list(python_obj.keys()) if python_obj else _gen_header(1)
is_valid = all(re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col) for col in header) # is this a valid header?
if not is_valid:
raise ValueError(
"Did not get a valid set of column names! Must match the regular expression: ^[a-zA-Z_][a-zA-Z0-9_.]*$ ")
for k in python_obj: # check that each value entry is a flat list/tuple or single int, float, or string
v = python_obj[k]
if isinstance(v, (tuple, list)): # if value is a tuple/list, then it must be flat
if _is_list_of_lists(v):
raise ValueError("Values in the dictionary must be flattened!")
elif is_type(v, str, numeric):
python_obj[k] = [v]
else:
raise ValueError("Encountered invalid dictionary value when constructing H2OFrame. Got: {0}".format(v))

zipper = getattr(itertools, "zip_longest", None) or getattr(itertools, "izip_longest", None) or zip
rows = list(map(list, zipper(*list(python_obj.values()))))
data_to_write = [dict(list(zip(header, row))) for row in rows]
return header, data_to_write


def _is_fr(o):
return o.__class__.__name__ == "H2OFrame" # hack to avoid circular imports

@@ -426,14 +357,9 @@ def slice_is_normalized(s):
quoted = _quoted
is_list = _is_list
is_fr = _is_fr
handle_python_dicts = _handle_python_dicts
handle_pandas_data_frame = _handle_pandas_data_frame
handle_numpy_array = _handle_numpy_array
is_list_of_lists = _is_list_of_lists
is_num_list = _is_num_list
is_str_list = _is_str_list
handle_python_lists = _handle_python_lists
check_lists_of_lists = _check_lists_of_lists

gen_model_file_name = "h2o-genmodel.jar"
h2o_predictor_class = "hex.genmodel.tools.PredictCsv"
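A short sketch of the import split after this commit, assuming an h2o build that contains it: the frame-construction helpers now live in h2o.frame, while the generic list utilities keep their public aliases in shared_utils.

```python
# Generic helpers still exported by h2o.utils.shared_utils (public aliases kept above).
from h2o.utils.shared_utils import gen_header, is_list_of_lists, is_str_list

# Frame-construction helpers were moved into h2o.frame by this commit.
from h2o.frame import _check_lists_of_lists

print(gen_header(3))                                           # ['C1', 'C2', 'C3']
print(is_list_of_lists([[1], [2]]), is_str_list(["a", "b"]))   # True True
print(_check_lists_of_lists([[1, 2, 3], [4]]))                 # 3 (longest sublist length)
```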