From 9ba30730130cfe7e77e9294e12db091864cbed54 Mon Sep 17 00:00:00 2001
From: sebhrusen
Date: Tue, 21 Nov 2023 22:57:45 +0100
Subject: [PATCH] =?UTF-8?q?added=20proper=20tests=20for=20H2OFrame=20const?=
 =?UTF-8?q?ructor=20and=20discovered=20a=20bunch=20of=20bugs=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 h2o-py/h2o/frame.py                                |  89 ++++++++--
 h2o-py/h2o/h2o.py                                  |   4 +-
 h2o-py/h2o/utils/shared_utils.py                   |  74 --------
 .../Data_Manipulation/pyunit_h2oH2OFrame.py        | 159 +++++++++++++++---
 4 files changed, 219 insertions(+), 107 deletions(-)

diff --git a/h2o-py/h2o/frame.py b/h2o-py/h2o/frame.py
index 8be019aa35ba..5f784bc3a140 100644
--- a/h2o-py/h2o/frame.py
+++ b/h2o-py/h2o/frame.py
@@ -9,16 +9,16 @@
 import datetime
 import functools
 from io import StringIO
+import itertools
 import os
-import sys
+import re
 import tempfile
-import traceback
 from types import FunctionType
 import warnings
 
 import h2o
 from h2o.base import Keyed
-from h2o.display import H2ODisplay, H2ODisplayWrapper, H2OItemsDisplay, H2OTableDisplay, display, in_ipy, in_zep, repr_def
+from h2o.display import H2ODisplay, H2ODisplayWrapper, H2OItemsDisplay, H2OTableDisplay, display, repr_def
 from h2o.exceptions import H2OTypeError, H2OValueError, H2ODeprecationWarning
 from h2o.expr import ExprNode
 from h2o.group_by import GroupBy
@@ -26,10 +26,9 @@
 from h2o.plot import get_matplotlib_pyplot, decorate_plot_result, RAISE_ON_FIGURE_ACCESS
 from h2o.utils.config import get_config_value
 from h2o.utils.metaclass import deprecated_fn
-from h2o.utils.shared_utils import (_handle_numpy_array, _handle_pandas_data_frame, _handle_python_dicts,
-                                    _handle_python_lists, _is_list, _is_str_list, _py_tmp_key, _quoted,
-                                    can_use_pandas, can_use_numpy, quote, normalize_slice, slice_is_normalized,
-                                    check_frame_id, can_use_datatable)
+from h2o.utils.shared_utils import (gen_header, is_list, is_list_of_lists, is_str_list, py_tmp_key, quoted,
+                                    can_use_pandas, can_use_numpy, quote, normalize_slice, slice_is_normalized,
+                                    check_frame_id, can_use_datatable)
 from h2o.utils.threading import local_context, local_env
 from h2o.utils.typechecks import (assert_is_type, assert_satisfies, Enum, I, is_type,
                                   numeric, numpy_ndarray, numpy_datetime, pandas_dataframe, pandas_timestamp, scipy_sparse, U)
@@ -171,7 +170,7 @@ def _upload_sparse_matrix(self, matrix, destination_frame=None):
         tmp_handle, tmp_path = tempfile.mkstemp(suffix=".svmlight")
         out = os.fdopen(tmp_handle, 'wt', **H2OFrame.__fdopen_kwargs)
         if destination_frame is None:
-            destination_frame = _py_tmp_key(h2o.connection().session_id)
+            destination_frame = py_tmp_key(h2o.connection().session_id)
 
         # sp.find(matrix) returns (row indices, column indices, values) of the non-zero elements of A. Unfortunately
         # there is no guarantee that those elements are returned in the correct order, so need to sort
@@ -501,7 +500,7 @@ def _parse_raw(self, setup):
         p.update({k: v for k, v in setup.items() if k in p})
 
         # Extract only 'name' from each src in the array of srcs
-        p['source_frames'] = [_quoted(src['name']) for src in setup['source_frames']]
+        p['source_frames'] = [quoted(src['name']) for src in setup['source_frames']]
 
         H2OJob(h2o.api("POST /3/Parse", data=p), "Parse").poll()
         # Need to return a Frame here for nearly all callers
@@ -717,7 +716,7 @@ def __pow__(self, rhs):
         return _binop(self, "^", rhs)
 
     def __contains__(self, lhs):
-        return all((t == self).any() for t in lhs) if _is_list(lhs) else (lhs == self).any()
+        return all((t == self).any() for t in lhs) if is_list(lhs) else (lhs == self).any()
 
     # rops
     def __rmod__(self, lhs):
@@ -2175,7 +2174,7 @@ def _compute_ncol_update(self, item):  # computes new ncol, names, and types
         new_ncols = -1
         if isinstance(item, list):
             new_ncols = len(item)
-            if _is_str_list(item):
+            if is_str_list(item):
                 new_types = {k: self.types[k] for k in item}
                 new_names = item
             else:
@@ -5193,3 +5192,71 @@ def generatePandaEnumCols(pandaFtrain, cname, nrows, domainL):
     ftemp = temp[newNames]
     ctemp = pd.concat([ftemp, zeroFrame], axis=1)
     return ctemp
+
+
+### Module-scope utility functions ###
+
+def _handle_python_lists(python_obj, check_header):
+    # convert all inputs to lol
+    if is_list_of_lists(python_obj):  # do we have a list of lists: [[...], ..., [...]] ?
+        ncols = _check_lists_of_lists(python_obj)  # must be a list of flat lists, raise ValueError if not
+    elif isinstance(python_obj, (list, tuple)):  # single list
+        ncols = 1
+        python_obj = [[e] for e in python_obj]
+    else:  # scalar
+        python_obj = [[python_obj]]
+        ncols = 1
+    # create the header
+    if check_header == 1:
+        header = python_obj[0]
+        python_obj = python_obj[1:]
+    else:
+        header = gen_header(ncols)
+    # shape up the data for csv.DictWriter
+    # data_to_write = [dict(list(zip(header, row))) for row in python_obj]
+    return header, python_obj
+
+
+def _handle_python_dicts(python_obj, check_header):
+    header = list(python_obj.keys()) if python_obj else gen_header(1)
+    is_valid = all(re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col) for col in header)  # is this a valid header?
+    if not is_valid:
+        raise ValueError(
+            "Did not get a valid set of column names! Must match the regular expression: ^[a-zA-Z_][a-zA-Z0-9_.]*$ ")
+    for k in python_obj:  # check that each value entry is a flat list/tuple or single int, float, or string
+        v = python_obj[k]
+        if isinstance(v, (tuple, list)):  # if value is a tuple/list, then it must be flat
+            if is_list_of_lists(v):
+                raise ValueError("Values in the dictionary must be flattened!")
+        elif is_type(v, str, numeric):
+            python_obj[k] = [v]
+        else:
+            raise ValueError("Encountered invalid dictionary value when constructing H2OFrame. Got: {0}".format(v))
+
+    zipper = getattr(itertools, "zip_longest", None) or getattr(itertools, "izip_longest", None) or zip
+    rows = list(map(list, zipper(*list(python_obj.values()))))
+    data_to_write = [dict(list(zip(header, row))) for row in rows]
+    return header, data_to_write
+
+def _handle_numpy_array(python_obj, header):
+    return _handle_python_lists(python_obj.tolist(), header)
+
+def _handle_pandas_data_frame(python_obj, header):
+    data = _handle_python_lists(python_obj.values.tolist(), -1)[1]
+    return list(str(c) for c in python_obj.columns), data
+
+def _check_lists_of_lists(python_obj):
+    # check we have a lists of flat lists
+    # returns longest length of sublist
+    most_cols = 1
+    for l in python_obj:
+        # All items in the list must be a list!
+        if not isinstance(l, (tuple, list)):
+            raise ValueError("`python_obj` is a mixture of nested lists and other types.")
+        most_cols = max(most_cols, len(l))
+        for ll in l:
+            # in fact, we must have a list of flat lists!
+            if isinstance(ll, (tuple, list)):
+                raise ValueError("`python_obj` is not a list of flat lists!")
+    return most_cols
+
diff --git a/h2o-py/h2o/h2o.py b/h2o-py/h2o/h2o.py
index 8b28fa462f61..d949193cd3cd 100644
--- a/h2o-py/h2o/h2o.py
+++ b/h2o-py/h2o/h2o.py
@@ -834,7 +834,8 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
     if is_type(raw_frames, str): raw_frames = [raw_frames]
 
     # temporary dictionary just to pass the following information to the parser: header, separator
-    kwargs = {"check_header": header, "source_frames": [quoted(frame_id) for frame_id in raw_frames],
+    kwargs = {"check_header": header,
+              "source_frames": [quoted(frame_id) for frame_id in raw_frames],
               "single_quotes": quotechar == "'"}
     if separator:
         kwargs["separator"] = ord(separator)
@@ -844,6 +845,7 @@ def parse_setup(raw_frames, destination_frame=None, header=0, separator=None, co
 
     if custom_non_data_line_markers is not None:
         kwargs["custom_non_data_line_markers"] = custom_non_data_line_markers
+
     if partition_by is not None:
         kwargs["partition_by"] = partition_by
 
diff --git a/h2o-py/h2o/utils/shared_utils.py b/h2o-py/h2o/utils/shared_utils.py
index fc2366e4b35d..3a649e6afe2c 100644
--- a/h2o-py/h2o/utils/shared_utils.py
+++ b/h2o-py/h2o/utils/shared_utils.py
@@ -162,43 +162,6 @@ def _gen_header(cols):
     return ["C" + str(c) for c in range(1, cols + 1, 1)]
 
 
-def _check_lists_of_lists(python_obj):
-    # check we have a lists of flat lists
-    # returns longest length of sublist
-    most_cols = 1
-    for l in python_obj:
-        # All items in the list must be a list!
-        if not isinstance(l, (tuple, list)):
-            raise ValueError("`python_obj` is a mixture of nested lists and other types.")
-        most_cols = max(most_cols, len(l))
-        for ll in l:
-            # in fact, we must have a list of flat lists!
-            if isinstance(ll, (tuple, list)):
-                raise ValueError("`python_obj` is not a list of flat lists!")
-    return most_cols
-
-
-def _handle_python_lists(python_obj, check_header):
-    # convert all inputs to lol
-    if _is_list_of_lists(python_obj):  # do we have a list of lists: [[...], ..., [...]] ?
-        ncols = _check_lists_of_lists(python_obj)  # must be a list of flat lists, raise ValueError if not
-    elif isinstance(python_obj, (list, tuple)):  # single list
-        ncols = 1
-        python_obj = [[e] for e in python_obj]
-    else:  # scalar
-        python_obj = [[python_obj]]
-        ncols = 1
-    # create the header
-    if check_header == 1:
-        header = python_obj[0]
-        python_obj = python_obj[1:]
-    else:
-        header = _gen_header(ncols)
-    # shape up the data for csv.DictWriter
-    # data_to_write = [dict(list(zip(header, row))) for row in python_obj]
-    return header, python_obj
-
-
 def stringify_dict(d):
     return stringify_list(["{'key': %s, 'value': %s}" % (_quoted(k), v) for k, v in d.items()])
 
@@ -244,38 +207,6 @@ def _is_num_list(l):
 def _is_list_of_lists(o):
     return any(isinstance(l, (tuple, list)) for l in o)
 
-
-def _handle_numpy_array(python_obj, header):
-    return _handle_python_lists(python_obj.tolist(), header)
-
-
-def _handle_pandas_data_frame(python_obj, header):
-    data = _handle_python_lists(python_obj.values.tolist(), -1)[1]
-    return list(str(c) for c in python_obj.columns), data
-
-
-def _handle_python_dicts(python_obj, check_header):
-    header = list(python_obj.keys()) if python_obj else _gen_header(1)
-    is_valid = all(re.match(r"^[a-zA-Z_][a-zA-Z0-9_.]*$", col) for col in header)  # is this a valid header?
-    if not is_valid:
-        raise ValueError(
-            "Did not get a valid set of column names! Must match the regular expression: ^[a-zA-Z_][a-zA-Z0-9_.]*$ ")
-    for k in python_obj:  # check that each value entry is a flat list/tuple or single int, float, or string
-        v = python_obj[k]
-        if isinstance(v, (tuple, list)):  # if value is a tuple/list, then it must be flat
-            if _is_list_of_lists(v):
-                raise ValueError("Values in the dictionary must be flattened!")
-        elif is_type(v, str, numeric):
-            python_obj[k] = [v]
-        else:
-            raise ValueError("Encountered invalid dictionary value when constructing H2OFrame. Got: {0}".format(v))
-
-    zipper = getattr(itertools, "zip_longest", None) or getattr(itertools, "izip_longest", None) or zip
-    rows = list(map(list, zipper(*list(python_obj.values()))))
-    data_to_write = [dict(list(zip(header, row))) for row in rows]
-    return header, data_to_write
-
-
 def _is_fr(o):
     return o.__class__.__name__ == "H2OFrame"  # hack to avoid circular imports
 
@@ -426,14 +357,9 @@ def slice_is_normalized(s):
 quoted = _quoted
 is_list = _is_list
 is_fr = _is_fr
-handle_python_dicts = _handle_python_dicts
-handle_pandas_data_frame = _handle_pandas_data_frame
-handle_numpy_array = _handle_numpy_array
 is_list_of_lists = _is_list_of_lists
 is_num_list = _is_num_list
 is_str_list = _is_str_list
-handle_python_lists = _handle_python_lists
-check_lists_of_lists = _check_lists_of_lists
 
 gen_model_file_name = "h2o-genmodel.jar"
 h2o_predictor_class = "hex.genmodel.tools.PredictCsv"
diff --git a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
index 5d082c6b767c..2fe59bb59b77 100644
--- a/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
+++ b/h2o-py/tests/testdir_apis/Data_Manipulation/pyunit_h2oH2OFrame.py
@@ -1,28 +1,145 @@
 import sys
 sys.path.insert(1,"../../../")
-from tests import pyunit_utils
+from tests import pyunit_utils as pu
 import h2o
 from h2o.utils.typechecks import assert_is_type
 from h2o.frame import H2OFrame
 
-def h2o_H2OFrame():
-    """
-    Python API test: h2o.frame.H2OFrame(python_obj=None, destination_frame=None, header=0, separator=', ',
-    column_names=None, column_types=None, na_strings=None)
-    """
-    python_lists = [[1,4,"a",1], [2,5,"b",0], [3,6,"c",1]]
-    col_names=["num1","num2","str1","enum1"]
-    dest_frame="newFrame"
-    heads=-1
-    sep=','
-    col_types=['numeric', 'numeric', 'string', 'enum']
-    na_str=['NA']
-    h2oframe = h2o.H2OFrame(python_obj=python_lists, destination_frame=dest_frame, header=heads, separator=sep,
-                            column_names=col_names, column_types=col_types, na_strings=na_str)
-    assert_is_type(h2oframe, H2OFrame)
-    assert h2oframe.nrows==len(python_lists) and h2oframe.ncols==len(python_lists[0]), "h2o.H2OFrame() command is " \
-                                                                                       "not working."
-
-
-pyunit_utils.standalone_test(h2o_H2OFrame)
+
+data = [
+    [1, .4, "a", 1],
+    [2, 5 , "b", 0],
+    [3, 6 , "", 1],
+]
+col_names = ["num1", "num2", "str1", "enum1"]
+col_types = ['numeric', 'numeric', 'string', 'enum']
+col_types_back = dict(zip(col_names, ['int', 'real', 'string', 'enum']))
+na_str = ['']
+
+
+def check_frame(fr, dest=None):
+    if dest is not None:
+        assert fr.key == dest
+    assert fr.nrows == len(data)
+    assert fr.ncols == len(data[0])
+    assert fr.columns == col_names
+    assert fr.types == col_types_back
+    for i, row in enumerate(data):
+        for j, value in enumerate(row):
+            expected_val = ('NA' if data[i][j] == na_str[0]
+                            else str(data[i][j]) if col_types[j] == 'enum'
+                            else data[i][j])
+            assert fr[i, j] == expected_val, "expected value at [%s, %s] = %s, but got %s" % (i, j, expected_val, fr[i, j])
+
+
+def H2OFrame_from_list_no_header():
+    fr = h2o.H2OFrame(data, destination_frame="from_list_no_header",
+                      column_names=col_names, column_types=col_types, na_strings=na_str)
+    assert_is_type(fr, H2OFrame)
+    check_frame(fr, "from_list_no_header")
+
+
+def H2OFrame_from_list_with_header():
+    fr = h2o.H2OFrame([col_names]+data, destination_frame="from_list_with_header",
+                      header=1, column_types=col_types, na_strings=na_str)
+    assert_is_type(fr, H2OFrame)
+    check_frame(fr, "from_list_with_header")
+
+
+def H2OFrame_from_dict():
+    fr = h2o.H2OFrame(dict(zip(col_names, zip(*data))), destination_frame="from_dict",
+                      column_types=col_types, na_strings=na_str)
+    assert_is_type(fr, H2OFrame)
+    check_frame(fr, "from_dict")
+
+
+def H2OFrame_from_numpy():
+    import numpy as np
+    fr = h2o.H2OFrame(np.array(data), destination_frame="from_numpy_array",
+                      column_names=col_names, column_types=col_types, na_strings=na_str)
+    assert_is_type(fr, H2OFrame)
+    check_frame(fr, "from_numpy_array")
+
+
+def H2OFrame_from_pandas():
+    import pandas as pd
+    fr = h2o.H2OFrame(pd.DataFrame(data, columns=col_names), destination_frame="from_pandas_dataframe",
+                      column_types=col_types, na_strings=na_str)
+    assert_is_type(fr, H2OFrame)
+    check_frame(fr, "from_pandas_dataframe")
+
+
+def H2OFrame_from_scipy():
+    from scipy.sparse import csr_matrix, csc_matrix
+    nostr_data = [[0 if isinstance(v, str) else v for v in r] for r in data]
+
+    def check_sparse_frame(fr, dest):
+        assert fr.key == dest
+        assert fr.nrows == len(nostr_data)
+        assert fr.ncols == len(nostr_data[0])
+        assert fr.columns == ["C%i" % i for i in range(1, fr.ncols+1)]  # BUG! construction with scipy matrix ignores `column_names` param: see ticket GH-#####
+        assert fr.types == dict(C1='int', C2='real', C3='int', C4='int')  # enum types are also ignored, but it makes sense for sparse matrices; we could add a warning though.
+        for i, row in enumerate(data):
+            for j, value in enumerate(row):
+                assert fr[i, j] == nostr_data[i][j], "expected value at [%s, %s] = %s, but got %s" % (i, j, nostr_data[i][j], fr[i, j])
+
+
+    fr = h2o.H2OFrame(csr_matrix(nostr_data), destination_frame="from_sparse_row_matrix",
+                      column_names=col_names, column_types=col_types)
+    assert_is_type(fr, H2OFrame)
+    print(fr)
+    check_sparse_frame(fr, "from_sparse_row_matrix")
+
+    fr = h2o.H2OFrame(csc_matrix(nostr_data), destination_frame="from_sparse_column_matrix",
+                      column_names=col_names, column_types=col_types)
+    assert_is_type(fr, H2OFrame)
+    check_sparse_frame(fr, "from_sparse_column_matrix")
+
+
+def H2OFrame_from_H2OFrame():
+    fr = h2o.H2OFrame(python_obj=data, destination_frame="h2oframe_ori",
+                      column_names=col_names, column_types=col_types, na_strings=na_str)
+
+    dupl1 = h2o.H2OFrame(python_obj=fr)
+    assert_is_type(dupl1, H2OFrame)
+    check_frame(dupl1)
+    assert fr.key != dupl1.key
+
+    dupl2 = h2o.H2OFrame(python_obj=fr, destination_frame="h2oframe_duplicate")
+    assert_is_type(dupl2, H2OFrame)
+    check_frame(dupl2, "h2oframe_duplicate")
+
+    dupl3 = h2o.H2OFrame(python_obj=fr, column_names=["n1", "n2", "s1", "e1"], destination_frame="h2oframe_duplicate_with_renamed_columns")
+    assert_is_type(dupl3, H2OFrame)
+    assert dupl3.key == "h2oframe_duplicate_with_renamed_columns"
+    assert dupl3.ncols == fr.ncols
+    assert dupl3.nrows == fr.nrows
+    assert dupl3.as_data_frame(use_pandas=False, header=False) == fr.as_data_frame(use_pandas=False, header=False)
+    assert dupl3.columns == ["n1", "n2", "s1", "e1"]
+
+    dupl4 = h2o.H2OFrame(python_obj=fr, skipped_columns=[1, 3], column_names=["n1", "s1"], destination_frame="h2oframe_duplicate_with_skipped_columns")
+    assert_is_type(dupl4, H2OFrame)
+    assert dupl4.key == "h2oframe_duplicate_with_skipped_columns"
+    assert dupl4.ncols == fr.ncols - 2
+    assert dupl4.nrows == fr.nrows
+    assert dupl4.as_data_frame(use_pandas=False, header=False) == fr.drop([1, 3]).as_data_frame(use_pandas=False, header=False)
+    assert dupl4.columns == ["n1", "s1"]
+
+
+def H2OFrame_skipped_columns_is_BUGGY():
+    try:
+        h2o.H2OFrame(data, skipped_columns=[1])
+        assert False, "skipped_columns handling may be fixed now"  # parse_setup is odd: only half of the parameters are passed to build the ParseSetup, and the rest of the logic is done locally, which is why it's buggy: see issue GH-#####
+    except ValueError as e:
+        assert "length of col_names should be equal to the number of columns parsed: 4 vs 3" in str(e)
+
+
+pu.run_tests([
+    H2OFrame_from_list_no_header,
+    H2OFrame_from_list_with_header,
+    H2OFrame_from_dict,
+    H2OFrame_from_numpy,
+    H2OFrame_from_pandas,
+    H2OFrame_from_scipy,
+    H2OFrame_from_H2OFrame,
+    H2OFrame_skipped_columns_is_BUGGY
+])
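
For context on the dict path exercised by H2OFrame_from_dict above: the module-scope _handle_python_dicts helper added to frame.py validates the column names, wraps scalar values into one-element lists, and then transposes the column dictionary into rows with itertools.zip_longest before the data is shaped for csv.DictWriter. A minimal standalone sketch of that transposition step (dict_to_header_and_rows is a hypothetical name used for illustration only, not part of this patch):

import itertools

def dict_to_header_and_rows(columns):
    # columns maps column name -> list of cell values, e.g. {"num1": [1, 2, 3], "str1": ["a", "b"]}
    header = list(columns.keys())
    # zip_longest pads shorter columns with None, so ragged inputs turn into missing cells
    rows = [list(r) for r in itertools.zip_longest(*columns.values())]
    return header, rows

header, rows = dict_to_header_and_rows({"num1": [1, 2, 3], "str1": ["a", "b"]})
print(header)  # ['num1', 'str1']
print(rows)    # [[1, 'a'], [2, 'b'], [3, None]]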