Skip to content

Commit

Permalink
GH-16348 - remove reference to erin-data and h2o-smalldata (#16364)
Browse files Browse the repository at this point in the history
* GH-16348 - remove reference to erin-data and add the one to h2o-public-test-data

* Fix filename and upload preprocessed  file from the bucket
  • Loading branch information
valenad1 authored Aug 19, 2024
1 parent 16cfb04 commit c7f0001
Show file tree
Hide file tree
Showing 15 changed files with 56 additions and 48 deletions.
8 changes: 6 additions & 2 deletions h2o-bindings/bin/custom/R/gen_stackedensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,12 @@ def update_param(name, param):
h2o.init()
# Import a sample binary outcome train/test set
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
train <- h2o.importFile(
"https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv"
)
test <- h2o.importFile(
"https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv"
)
# Identify predictors and response
y <- "response"
Expand Down
22 changes: 11 additions & 11 deletions h2o-docs/src/product/admissible.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ The code below generates an infogram, and we plot the infogram and view the data
h2o.init()

# Import credit dataset
f <- "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/taiwan_credit_card_uci_prep.csv"
col_types <- list(by.col.name = c("SEX", "MARRIAGE", "default_payment_next_month"),
types = c("factor", "factor", "factor"))
df <- h2o.importFile(path = f, col.types = col_types)
Expand Down Expand Up @@ -201,7 +201,7 @@ The code below generates an infogram, and we plot the infogram and view the data
h2o.init()

# Import credit dataset
f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/taiwan_credit_card_uci_prep.csv"
col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
df = h2o.import_file(path=f, col_types=col_types)

Expand Down Expand Up @@ -259,7 +259,7 @@ The code below generates an infogram, and we plot the infogram and view the data
h2o.init()

# Import HMDA dataset
f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
col_types <- list(by.col.name = c("high_priced"),
types = c("factor"))
df <- h2o.importFile(path = f, col.types = col_types)
Expand Down Expand Up @@ -303,7 +303,7 @@ The code below generates an infogram, and we plot the infogram and view the data
h2o.init()

# Import HMDA dataset
f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
col_types = {'high_priced': "enum"}
df = h2o.import_file(path=f, col_types=col_types)

Expand Down Expand Up @@ -548,7 +548,7 @@ impact ratio (air), significant adverse impact ratio (calculated only using the
.. tabs::
.. code-tab:: r R

f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
col_types <- list(by.col.name = c("high_priced"),
types = c("factor"))
df <- h2o.importFile(path = f, col.types = col_types)
Expand Down Expand Up @@ -590,7 +590,7 @@ impact ratio (air), significant adverse impact ratio (calculated only using the
.. code-tab:: python

# Import HMDA dataset
f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
col_types = {'high_priced': "enum"}
df = h2o.import_file(path=f, col_types=col_types)

Expand Down Expand Up @@ -638,7 +638,7 @@ Characteristics or Precision-Recall Curves.
.. tabs::
.. code-tab:: r R

f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
col_types <- list(by.col.name = c("high_priced"),
types = c("factor"))
df <- h2o.importFile(path = f, col.types = col_types)
Expand Down Expand Up @@ -670,7 +670,7 @@ Characteristics or Precision-Recall Curves.
.. code-tab:: python

# Import HMDA dataset
f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
col_types = {'high_priced': "enum"}
df = h2o.import_file(path=f, col_types=col_types)

Expand Down Expand Up @@ -749,7 +749,7 @@ This kind of SHAP plot can be obtained using ``model.fair_shap_plot``/``h2o.fair
.. tabs::
.. code-tab:: r R

f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
col_types <- list(by.col.name = c("high_priced"),
types = c("factor"))
df <- h2o.importFile(path = f, col.types = col_types)
Expand Down Expand Up @@ -781,7 +781,7 @@ This kind of SHAP plot can be obtained using ``model.fair_shap_plot``/``h2o.fair
.. code-tab:: python

# Import HMDA dataset
f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
col_types = {'high_priced': "enum"}
df = h2o.import_file(path=f, col_types=col_types)

Expand Down Expand Up @@ -836,4 +836,4 @@ Subhadeep Mukhopadhyay. *InfoGram and Admissible Machine Learning*, August 2021.

LUM, Kristian, ZHANG, Yunfeng and BOWER, Amanda. *De-biasing “bias” measurement*, June 2022. `arXiv Url <https://arxiv.org/abs/2205.05770>`__.

HARDT, Moritz, PRICE, Eric and SREBRO, Nathan. *Equality of Opportunity in Supervised Learning*, October 2016. `arXiv Url <https://arxiv.org/abs/1610.02413>`__.
HARDT, Moritz, PRICE, Eric and SREBRO, Nathan. *Equality of Opportunity in Supervised Learning*, October 2016. `arXiv Url <https://arxiv.org/abs/1610.02413>`__.
8 changes: 4 additions & 4 deletions h2o-docs/src/product/automl.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ Here’s an example showing basic usage of the ``h2o.automl()`` function in *R*
h2o.init()

# Import a sample binary outcome train/test set into H2O
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")

# Identify predictors and response
y <- "response"
Expand Down Expand Up @@ -241,8 +241,8 @@ Here’s an example showing basic usage of the ``h2o.automl()`` function in *R*
h2o.init()

# Import a sample binary outcome train/test set into H2O
train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")

# Identify predictors and response
x = train.columns
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ Example
h2o.init()

# Import a sample binary outcome training set into H2O
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")

# Identify predictors and response
x <- setdiff(names(train), y)
Expand Down Expand Up @@ -77,7 +77,7 @@ Example
h2o.init()

# Import a sample binary outcome training set into H2O
train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")

# Identify predictors and response
x = train.columns
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ Example
h2o.init()

# Import a sample binary outcome training set into H2O
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")

# Identify predictors and response
x <- setdiff(names(train), y)
Expand Down Expand Up @@ -77,7 +77,7 @@ Example
h2o.init()

# Import a sample binary outcome training set into H2O
train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")

# Identify predictors and response
x = train.columns
Expand Down
4 changes: 2 additions & 2 deletions h2o-docs/src/product/data-science/algo-params/sort_metric.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ Example
h2o.init()

# Import a sample binary outcome training set into H2O
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")

# Identify predictors and response
x <- setdiff(names(train), y)
Expand Down Expand Up @@ -81,7 +81,7 @@ Example
h2o.init()

# Import a sample binary outcome training set into H2O
train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")

# Identify predictors and response
x = train.columns
Expand Down
8 changes: 4 additions & 4 deletions h2o-docs/src/product/data-science/stacked-ensembles.rst
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,8 @@ Below is a simple example showing how to build a Stacked Ensembles model.
h2o.init()

# Import a sample binary outcome train/test set into H2O
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")

# Identify predictors and response
y <- "response"
Expand Down Expand Up @@ -287,8 +287,8 @@ Below is a simple example showing how to build a Stacked Ensembles model.
h2o.init()

# Import a sample binary outcome train/test set into H2O
train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")

# Identify predictors and response
x = train.columns
Expand Down
8 changes: 4 additions & 4 deletions h2o-docs/src/product/grid-search.rst
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ Grid Search Examples
h2o.init()

# Import a sample binary outcome dataset into H2O
data <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
data <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")

# Identify predictors and response
y <- "response"
Expand Down Expand Up @@ -175,8 +175,8 @@ Grid Search Examples
h2o.init()

# Import a sample binary outcome dataset into H2O
data = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")

# Identify predictors and response
x = data.columns
Expand Down
8 changes: 4 additions & 4 deletions h2o-docs/src/product/performance-and-prediction.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2428,8 +2428,8 @@ Allowed options include:
h2o.init()

# Import a sample binary outcome dataset into H2O
data <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
data <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")

# Identify predictors and response
y <- "response"
Expand Down Expand Up @@ -2470,8 +2470,8 @@ Allowed options include:
h2o.init()

# Import a sample binary outcome dataset into H2O
data = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")

# Identify predictors and response
x = data.columns
Expand Down
4 changes: 2 additions & 2 deletions h2o-r/ensemble/demos/h2o_ensemble_documentation_example.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ h2o.init(nthreads = -1) # Start an H2O cluster with nthreads = num cores on you


# Import a sample binary outcome train/test set into R
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_5k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_5k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
y <- "response"
x <- setdiff(names(train), y)
family <- "binomial"
Expand Down
4 changes: 2 additions & 2 deletions h2o-r/ensemble/demos/h2o_metalearn_documentation_example.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ h2o.init(nthreads = -1) # Start an H2O cluster with nthreads = num cores on you


# Import a sample binary outcome train/test set into R
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_5k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_5k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
y <- "response"
x <- setdiff(names(train), y)
family <- "binomial"
Expand Down
4 changes: 2 additions & 2 deletions h2o-r/ensemble/demos/h2o_stack_documentation_example.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ h2o.init(nthreads = -1) # Start an H2O cluster with nthreads = num cores on you


# Import a sample binary outcome train/test set into R
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_5k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_5k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
y <- "response"
x <- setdiff(names(train), y)
family <- "binomial"
Expand Down
6 changes: 3 additions & 3 deletions h2o-r/ensemble/demos/higgs_randomgrid_stack.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ h2o.init(nthreads = -1) # Start an H2O cluster with nthreads = num cores on you


# Import a sample binary outcome train/test set into R
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_5k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_5k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
y <- "response"
x <- setdiff(names(train), y)
family <- "binomial"
Expand Down Expand Up @@ -172,4 +172,4 @@ print(perf3)
#
#Ensemble performance (AUC): 0.781995245966915

# We have a winner.
# We have a winner.
8 changes: 6 additions & 2 deletions h2o-r/h2o-package/R/stackedensemble.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,12 @@
#' h2o.init()
#'
#' # Import a sample binary outcome train/test set
#' train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
#' test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
#' train <- h2o.importFile(
#' "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv"
#' )
#' test <- h2o.importFile(
#' "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv"
#' )
#'
#' # Identify predictors and response
#' y <- "response"
Expand Down
4 changes: 2 additions & 2 deletions h2o-r/tests/testdir_jira/runit_NOPASS_pub_2800.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ source("../../scripts/h2o-r-test-setup.R")
# R behavior: Reports an error but keeps the frame as is

test.pubdev.2800 <- function(conn){
df <- h2o.importFile("http://h2o-smalldata.s3.amazonaws.com/jira/test_string_missing.csv")
df <- h2o.importFile("smalldata/jira/test_string_missing.csv")
expect_false(is.na(df[3,2]))
}

doTest("'0' Parsed incorrectly", test.pubdev.2800)
doTest("'0' Parsed incorrectly", test.pubdev.2800)

0 comments on commit c7f0001

Please sign in to comment.