Skip to content

Commit

Permalink
replaced partial_plot function param 'data' with 'frame', and kept the old param as deprecated
Browse files Browse the repository at this point in the history
  • Loading branch information
sebhrusen committed Oct 31, 2023
1 parent cd38506 commit 445b209
Show file tree
Hide file tree
Showing 13 changed files with 53 additions and 53 deletions.
24 changes: 12 additions & 12 deletions h2o-py/h2o/model/model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1251,15 +1251,15 @@ def _get_metrics(o, train, valid, xval):
metrics["train"] = output["training_metrics"]
return metrics

@deprecated_params({'save_to_file': 'save_plot_path'})
def partial_plot(self, data, cols=None, destination_key=None, nbins=20, weight_column=None,
@deprecated_params({'data': 'frame', 'save_to_file': 'save_plot_path'})
def partial_plot(self, frame, cols=None, destination_key=None, nbins=20, weight_column=None,
plot=True, plot_stddev=True, figsize=(7, 10), server=False, include_na=False, user_splits=None,
col_pairs_2dpdp=None, save_plot_path=None, row_index=None, targets=None):
"""
Create partial dependence plot which gives a graphical depiction of the marginal effect of a variable on the
response. The effect of a variable is measured in change in the mean response.
:param H2OFrame data: An H2OFrame object used for scoring and constructing the plot.
:param H2OFrame frame: An H2OFrame object used for scoring and constructing the plot.
:param cols: Feature(s) for which partial dependence will be calculated.
:param destination_key: A key reference to the created partial dependence tables in H2O.
:param nbins: Number of bins used. For categorical columns, make sure the number of bins exceeds the level count. If you enable ``include_na``, the returned length will be nbins+1.
Expand All @@ -1277,7 +1277,7 @@ def partial_plot(self, data, cols=None, destination_key=None, nbins=20, weight_c
:returns: List of calculated mean response tables for each feature requested, plus the resulting plot (can be accessed using ``result.figure()``).
"""
if not isinstance(data, h2o.H2OFrame): raise ValueError("Data must be an instance of H2OFrame.")
if not isinstance(frame, h2o.H2OFrame): raise ValueError("frame must be an instance of H2OFrame.")
num_1dpdp = 0
num_2dpdp = 0
if cols is not None:
Expand All @@ -1301,22 +1301,22 @@ def partial_plot(self, data, cols=None, destination_key=None, nbins=20, weight_c
# Check that the specified cols exist in the frame
if cols is not None:
for xi in cols:
if xi not in data.names:
if xi not in frame.names:
raise H2OValueError("Column %s does not exist in the training frame." % xi)
if col_pairs_2dpdp is not None:
for oneP in col_pairs_2dpdp:
if oneP[0] not in data.names:
if oneP[0] not in frame.names:
raise H2OValueError("Column %s does not exist in the training frame." % oneP[0])
if oneP[1] not in data.names:
if oneP[1] not in frame.names:
raise H2OValueError("Column %s does not exist in the training frame." % oneP[1])
if oneP[0] is oneP[1]:
raise H2OValueError("2D pdp must be with different columns.")
if isinstance(weight_column, int) and not (weight_column == -1):
raise H2OValueError("Weight column should be a column name in your data frame.")
elif isinstance(weight_column, str): # index is a name
if weight_column not in data.names:
if weight_column not in frame.names:
raise H2OValueError("Column %s does not exist in the data frame" % weight_column)
weight_column = data.names.index(weight_column)
weight_column = frame.names.index(weight_column)

if row_index is not None:
if not isinstance(row_index, int):
Expand All @@ -1334,7 +1334,7 @@ def partial_plot(self, data, cols=None, destination_key=None, nbins=20, weight_c
kwargs = {}
kwargs["cols"] = cols
kwargs["model_id"] = self.model_id
kwargs["frame_id"] = data.frame_id
kwargs["frame_id"] = frame.frame_id
kwargs["nbins"] = nbins
kwargs["destination_key"] = destination_key
kwargs["weight_column_index"] = weight_column
Expand All @@ -1344,7 +1344,7 @@ def partial_plot(self, data, cols=None, destination_key=None, nbins=20, weight_c
if targets:
kwargs["targets"] = targets

self.__generate_user_splits(user_splits, data, kwargs)
self.__generate_user_splits(user_splits, frame, kwargs)
json = H2OJob(h2o.api("POST /3/PartialDependence/", data=kwargs), job_type="PartialDependencePlot").poll()
json = h2o.api("GET /3/PartialDependence/%s" % json.dest_key)

Expand All @@ -1353,7 +1353,7 @@ def partial_plot(self, data, cols=None, destination_key=None, nbins=20, weight_c

# Plot partial dependence plots using matplotlib
return self.__generate_partial_plots(num_1dpdp, num_2dpdp, plot, server, pps, figsize,
col_pairs_2dpdp, data, nbins,
col_pairs_2dpdp, frame, nbins,
kwargs["user_cols"], kwargs["num_user_splits"],
plot_stddev, cols, save_plot_path, row_index, targets, include_na)

Expand Down
4 changes: 2 additions & 2 deletions h2o-py/tests/testdir_algos/gbm/pyunit_gbm_pojo_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ def prostate_pojo_import():
assert_frame_equal(preds_original.as_data_frame(), preds_imported.as_data_frame())

# 2. check we can get PDPs
pdp_original = model.partial_plot(data=prostate, cols=['AGE'], server=True, plot=False)
pdp_imported = model_imported.partial_plot(data=prostate, cols=['AGE'], server=True, plot=False)
pdp_original = model.partial_plot(frame=prostate, cols=['AGE'], server=True, plot=False)
pdp_imported = model_imported.partial_plot(frame=prostate, cols=['AGE'], server=True, plot=False)
assert_frame_equal(pdp_original[0].as_data_frame(), pdp_imported[0].as_data_frame())


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ def partial_plots():
with TemporaryDirectory() as tmpdir:
path1 = "{}/plot1.png".format(tmpdir)
path2 = "{}/plot2.png".format(tmpdir)
test_plot_result_saving(gbm_model.partial_plot(data=data, cols=['AGE'], server=True, plot=True, row_index=1), path2,
gbm_model.partial_plot(data=data, cols=['AGE'], server=True, plot=True, row_index=1, save_plot_path=path1), path1)
test_plot_result_saving(gbm_model.partial_plot(frame=data, cols=['AGE'], server=True, plot=True, row_index=1), path2,
gbm_model.partial_plot(frame=data, cols=['AGE'], server=True, plot=True, row_index=1, save_plot_path=path1), path1)


def partial_plots_multinomial():
Expand All @@ -178,9 +178,9 @@ def partial_plots_multinomial():

test_plot_result_saving(model.plot(), path2, model.plot(save_plot_path=path1), path1)

test_plot_result_saving(model.partial_plot(data=iris, cols=cols, targets=targets, plot_stddev=True, plot=True,
test_plot_result_saving(model.partial_plot(frame=iris, cols=cols, targets=targets, plot_stddev=True, plot=True,
server=True), path2,
model.partial_plot(data=iris, cols=cols, targets=targets, plot_stddev=True, plot=True,
model.partial_plot(frame=iris, cols=cols, targets=targets, plot_stddev=True, plot=True,
server=True, save_to_file=path1), path1)

def roc_pr_curve():
Expand Down
4 changes: 2 additions & 2 deletions h2o-py/tests/testdir_jira/pyunit_pubdev_7705.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ def partial_plot_row_index():
gbm_model.train(x=x, y=y, training_frame=data)

# Generate Partial Dependence for row index -1 and row index 0, they should differ
pdp = gbm_model.partial_plot(data=data, cols=['RACE'], plot=False, plot_stddev=False, row_index=-1)
pdp0 = gbm_model.partial_plot(data=data, cols=['RACE'], plot=False, plot_stddev=False, row_index=0)
pdp = gbm_model.partial_plot(frame=data, cols=['RACE'], plot=False, plot_stddev=False, row_index=-1)
pdp0 = gbm_model.partial_plot(frame=data, cols=['RACE'], plot=False, plot_stddev=False, row_index=0)
assert not(pyunit_utils.equal_two_arrays(pdp[0][1], pdp0[0][1], throw_error=False))


Expand Down
2 changes: 1 addition & 1 deletion h2o-py/tests/testdir_jira/pyunit_pubdev_7949_pdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def test_pdp_user_splits_no_cardinality_check():
user_splits = {
"AGE": ["64", "75"]
}
pdp = gbm_model.partial_plot(data=data, cols=['AGE'], user_splits=user_splits, plot=False)
pdp = gbm_model.partial_plot(frame=data, cols=['AGE'], user_splits=user_splits, plot=False)
assert len(pdp[0].cell_values) == 2


Expand Down
6 changes: 3 additions & 3 deletions h2o-py/tests/testdir_misc/pyunit_partial_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def partial_plot_test():
gbm_model.train(x=x, y=y, training_frame=data)

# Plot Partial Dependence for one feature then for both
pdp1 = gbm_model.partial_plot(data=data, cols=['AGE'], server=True, plot=True)
pdp1 = gbm_model.partial_plot(frame=data, cols=['AGE'], server=True, plot=True)
# Manual test
h2o_mean_response_pdp1 = pdp1[0]["mean_response"]
h2o_stddev_response_pdp1 = pdp1[0]["stddev_response"]
Expand All @@ -34,7 +34,7 @@ def partial_plot_test():
assert h2o_stddev_response_pdp1 == pdp_manual[1]
assert h2o_std_error_mean_response_pdp1 == pdp_manual[2]

pdp2=gbm_model.partial_plot(data=data, cols=['AGE', 'RACE'], server=True, plot=False)
pdp2=gbm_model.partial_plot(frame=data, cols=['AGE', 'RACE'], server=True, plot=False)
# Manual test
h2o_mean_response_pdp2 = pdp2[0]["mean_response"]
h2o_stddev_response_pdp2 = pdp2[0]["stddev_response"]
Expand All @@ -56,7 +56,7 @@ def partial_plot_test():
assert h2o_std_error_mean_response_pdp2_race == pdp_manual[2]

# Plot Partial Dependence for one row
pdp_row = gbm_model.partial_plot(data=data, cols=['AGE'], server=True, plot=True, row_index=1)
pdp_row = gbm_model.partial_plot(frame=data, cols=['AGE'], server=True, plot=True, row_index=1)
# Manual test
h2o_mean_response_pdp_row = pdp_row[0]["mean_response"]
h2o_stddev_response_pdp_row = pdp_row[0]["stddev_response"]
Expand Down
20 changes: 10 additions & 10 deletions h2o-py/tests/testdir_misc/pyunit_partial_plots_multinomial.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,55 +32,55 @@ def partial_plot_test():
# one class target
cols = ["petal_len"]
targets = ["Iris-setosa"]
pdp_petal_len_se = model.partial_plot(data=iris, cols=cols, targets=targets, plot_stddev=False,
pdp_petal_len_se = model.partial_plot(frame=iris, cols=cols, targets=targets, plot_stddev=False,
plot=True, server=True)
print(pdp_petal_len_se)

pdp_petal_len_se_std = model.partial_plot(data=iris, cols=cols, targets=targets, plot_stddev=True,
pdp_petal_len_se_std = model.partial_plot(frame=iris, cols=cols, targets=targets, plot_stddev=True,
plot=True, server=True)
print(pdp_petal_len_se_std)

# two classes target
targets = ["Iris-setosa", "Iris-virginica"]
pdp_petal_len_se_vi = model.partial_plot(data=iris, cols=cols, targets=targets, plot_stddev=False,
pdp_petal_len_se_vi = model.partial_plot(frame=iris, cols=cols, targets=targets, plot_stddev=False,
plot=True, server=True)
print(pdp_petal_len_se_vi)

pdp_petal_len_se_vi_std = model.partial_plot(data=iris, cols=cols, targets=targets, plot_stddev=True,
pdp_petal_len_se_vi_std = model.partial_plot(frame=iris, cols=cols, targets=targets, plot_stddev=True,
plot=True, server=True)
print(pdp_petal_len_se_vi_std)

# three classes target
targets = ["Iris-setosa", "Iris-virginica", "Iris-versicolor"]
pdp_petal_len_se_vi_ve_std = model.partial_plot(data=iris, cols=cols, targets=targets, plot_stddev=True,
pdp_petal_len_se_vi_ve_std = model.partial_plot(frame=iris, cols=cols, targets=targets, plot_stddev=True,
plot=True, server=True)
print(pdp_petal_len_se_vi_ve_std)

# two columns and three classes target
cols = ["sepal_len", "petal_len"]
pdp_petal_len_sepal_len_se_vi_ve_std = model.partial_plot(data=iris, cols=cols, targets=targets, plot_stddev=True,
pdp_petal_len_sepal_len_se_vi_ve_std = model.partial_plot(frame=iris, cols=cols, targets=targets, plot_stddev=True,
plot=True, server=True)
print(pdp_petal_len_sepal_len_se_vi_ve_std)

# three columns and three classes target
cols = ["sepal_len","petal_len", "sepal_wid"]
pdp_petal_len_sepal_len_sepal_wid_se_vi_ve = model.partial_plot(data=iris, cols=cols, targets=targets,
pdp_petal_len_sepal_len_sepal_wid_se_vi_ve = model.partial_plot(frame=iris, cols=cols, targets=targets,
plot_stddev=False, plot=True, server=True)
print(pdp_petal_len_sepal_len_sepal_wid_se_vi_ve)

pdp_petal_len_sepal_len_sepal_wid_se_vi_ve_std = model.partial_plot(data=iris, cols=cols, targets=targets,
pdp_petal_len_sepal_len_sepal_wid_se_vi_ve_std = model.partial_plot(frame=iris, cols=cols, targets=targets,
plot_stddev=True, plot=True, server=True)
print(pdp_petal_len_sepal_len_sepal_wid_se_vi_ve_std)

# categorical column - nonsense column, just for testing
cols = ["random_cat"]
targets = ["Iris-setosa"]
pdp_petal_len_cat = model.partial_plot(data=iris, cols=cols, targets=targets, plot_stddev=False, plot=True,
pdp_petal_len_cat = model.partial_plot(frame=iris, cols=cols, targets=targets, plot_stddev=False, plot=True,
server=True)
print(pdp_petal_len_cat)

targets = ["Iris-setosa", "Iris-versicolor"]
pdp_petal_len_cat_std = model.partial_plot(data=iris, cols=cols, targets=targets, plot_stddev=True, plot=True,
pdp_petal_len_cat_std = model.partial_plot(frame=iris, cols=cols, targets=targets, plot_stddev=True, plot=True,
server=True)
print(pdp_petal_len_cat_std)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ def partial_plot_test_with_user_splits():
# pdp without weight or NA
with pyunit_utils.TemporaryDirectory() as tmpdir:
file, filename = tempfile.mkstemp(suffix=".png", dir=tmpdir)
pdpOrig = gbm_model.partial_plot(data=data,cols=['AGE', 'RACE', 'DPROS'],server=True, plot=True, save_to_file=filename)
pdpOrig = gbm_model.partial_plot(frame=data,cols=['AGE', 'RACE', 'DPROS'],server=True, plot=True, save_to_file=filename)
assert os.path.getsize(filename) > 0

pdpUserSplit = gbm_model.partial_plot(data=data,cols=['AGE', 'RACE', 'DPROS'],server=True, plot=True,
pdpUserSplit = gbm_model.partial_plot(frame=data,cols=['AGE', 'RACE', 'DPROS'],server=True, plot=True,
user_splits=user_splits)

# compare results
Expand Down
6 changes: 3 additions & 3 deletions h2o-py/tests/testdir_misc/pyunit_pubdev_5761_pdp_NA.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,16 @@ def partial_plot_test():
gbm_model.train(x=x, y=y, training_frame=data)

# pdp without weight or NA
pdpOrig = gbm_model.partial_plot(data=data,cols=['AGE', 'RACE'],server=True, plot=True)
pdpOrig = gbm_model.partial_plot(frame=data,cols=['AGE', 'RACE'],server=True, plot=True)
# pdp with constant weight and NA
pdpcWNA = gbm_model.partial_plot(data=data, cols=['AGE', 'RACE'], server=True, plot=True,
pdpcWNA = gbm_model.partial_plot(frame=data, cols=['AGE', 'RACE'], server=True, plot=True,
weight_column="constWeight", include_na=True)

# compare results
pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpOrig[0], pdpcWNA[0], pdpOrig[0].col_header, tolerance=1e-10)
pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdpOrig[1], pdpcWNA[1], pdpOrig[1].col_header, tolerance=1e-10)
# pdp with changing weight NA
pdpvWNA = gbm_model.partial_plot(data=data, cols=['AGE', 'RACE'], server=True, plot=True,
pdpvWNA = gbm_model.partial_plot(frame=data, cols=['AGE', 'RACE'], server=True, plot=True,
weight_column="variWeight", include_na=True)
ageList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[0], "age")
raceList = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpvWNA[1], "race")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ def partial_plot_test():
gbm_model.train(x=x, y=y, training_frame=data)

# pdp with weight and no NA
pdpw = gbm_model.partial_plot(data=test, cols=["Input_miss", "Distance"], server=True, plot=False,
pdpw = gbm_model.partial_plot(frame=test, cols=["Input_miss", "Distance"], server=True, plot=False,
weight_column=WC)

# pdp with weight and NA
pdpwNA = gbm_model.partial_plot(data=test, cols=["Input_miss", "Distance"], server=True, plot=False,
pdpwNA = gbm_model.partial_plot(frame=test, cols=["Input_miss", "Distance"], server=True, plot=False,
weight_column=WC, include_na = True)
input_miss_list = pyunit_utils.extract_col_value_H2OTwoDimTable(pdpwNA[0], "input_miss")
assert math.isnan(input_miss_list[-1]), "Expected last element to be nan but is not."
Expand All @@ -47,4 +47,4 @@ def partial_plot_test():
if __name__ == "__main__":
pyunit_utils.standalone_test(partial_plot_test)
else:
partial_plot_test()
partial_plot_test()
6 changes: 3 additions & 3 deletions h2o-py/tests/testdir_misc/pyunit_pubdev_6438_2D_pdp.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,12 @@ def partial_plot_test_with_user_splits():
67.63157894736842, 69.52631578947368, 71.42105263157895, 73.3157894736842,
75.21052631578948, 77.10526315789474]
user_splits['RACE'] = ["Black", "White"]
pdpUserSplit2D = gbm_model.partial_plot(data=data,server=True, plot=True, user_splits=user_splits,
pdpUserSplit2D = gbm_model.partial_plot(frame=data,server=True, plot=True, user_splits=user_splits,
col_pairs_2dpdp=[['AGE', 'PSA'], ['AGE', 'RACE']], save_to_file=filename)
pdpUserSplit1D2D = gbm_model.partial_plot(data=data, cols=['AGE', 'RACE', 'DCAPS'], server=True, plot=True,
pdpUserSplit1D2D = gbm_model.partial_plot(frame=data, cols=['AGE', 'RACE', 'DCAPS'], server=True, plot=True,
user_splits=user_splits,
col_pairs_2dpdp=[['AGE', 'PSA'], ['AGE', 'RACE']], save_to_file=filename)
pdpUserSplit1D = gbm_model.partial_plot(data=data,cols=['AGE', 'RACE', 'DCAPS'], server=True, plot=True,
pdpUserSplit1D = gbm_model.partial_plot(frame=data,cols=['AGE', 'RACE', 'DCAPS'], server=True, plot=True,
user_splits=user_splits, save_to_file=filename)
if os.path.isfile(filename):
os.remove(filename)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ def partial_plot_test_with_no_user_splits_no_1DPDP():
gbm_model.train(x=x, y=y, training_frame=data)

# pdp without weight or NA
pdp2dOnly = gbm_model.partial_plot(data=data, server=True, plot=False,
pdp2dOnly = gbm_model.partial_plot(frame=data, server=True, plot=False,
col_pairs_2dpdp=[['AGE', 'PSA'],['AGE', 'RACE']])
pdp1D2D = gbm_model.partial_plot(data=data, cols=['AGE', 'RACE', 'DCAPS'], server=True, plot=False,
pdp1D2D = gbm_model.partial_plot(frame=data, cols=['AGE', 'RACE', 'DCAPS'], server=True, plot=False,
col_pairs_2dpdp=[['AGE', 'PSA'], ['AGE', 'RACE']])
# compare results 2D pdp
pyunit_utils.assert_H2OTwoDimTable_equal_upto(pdp2dOnly[0], pdp1D2D[3],
Expand Down
Loading

0 comments on commit 445b209

Please sign in to comment.