GH-16348 - remove reference to erin-data and h2o-smalldata (#16364)

* GH-16348 - remove reference to erin-data and add the one to h2o-public-test-data * Fix filename and upload preprocessed file from the bucket
h2oai · Aug 19, 2024 · c7f0001 · c7f0001
1 parent 16cfb04
commit c7f0001
Show file tree

Hide file tree

Showing 15 changed files with 56 additions and 48 deletions.
diff --git a/h2o-bindings/bin/custom/R/gen_stackedensemble.py b/h2o-bindings/bin/custom/R/gen_stackedensemble.py
@@ -90,8 +90,12 @@ def update_param(name, param):
 h2o.init()
 
 # Import a sample binary outcome train/test set
-train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+train <- h2o.importFile(
+    "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv"
+    )
+test <- h2o.importFile(
+    "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv"
+    )
 
 # Identify predictors and response
 y <- "response"

diff --git a/h2o-docs/src/product/admissible.rst b/h2o-docs/src/product/admissible.rst
@@ -167,7 +167,7 @@ The code below generates an infogram, and we plot the infogram and view the data
         h2o.init()
 
         # Import credit dataset
-        f <- "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+        f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/taiwan_credit_card_uci_prep.csv"
         col_types <- list(by.col.name = c("SEX", "MARRIAGE", "default_payment_next_month"), 
                           types = c("factor", "factor", "factor"))
         df <- h2o.importFile(path = f, col.types = col_types)
@@ -201,7 +201,7 @@ The code below generates an infogram, and we plot the infogram and view the data
         h2o.init()
 
         # Import credit dataset
-        f = "https://erin-data.s3.amazonaws.com/admissible/data/taiwan_credit_card_uci.csv"
+        f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/taiwan_credit_card_uci_prep.csv"
         col_types = {'SEX': "enum", 'MARRIAGE': "enum", 'default_payment_next_month': "enum"}
         df = h2o.import_file(path=f, col_types=col_types)
 
@@ -259,7 +259,7 @@ The code below generates an infogram, and we plot the infogram and view the data
         h2o.init()
 
         # Import HMDA dataset
-        f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
+        f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
         col_types <- list(by.col.name = c("high_priced"), 
                           types = c("factor"))
         df <- h2o.importFile(path = f, col.types = col_types)
@@ -303,7 +303,7 @@ The code below generates an infogram, and we plot the infogram and view the data
         h2o.init()
 
         # Import HMDA dataset
-        f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
+        f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
         col_types = {'high_priced': "enum"}
         df = h2o.import_file(path=f, col_types=col_types)
 
@@ -548,7 +548,7 @@ impact ratio (air), significant adverse impact ratio (calculated only using the
 .. tabs::
    .. code-tab:: r R
 
-        f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
+        f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
         col_types <- list(by.col.name = c("high_priced"),
                           types = c("factor"))
         df <- h2o.importFile(path = f, col.types = col_types)
@@ -590,7 +590,7 @@ impact ratio (air), significant adverse impact ratio (calculated only using the
    .. code-tab:: python
 
         # Import HMDA dataset
-        f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
+        f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
         col_types = {'high_priced': "enum"}
         df = h2o.import_file(path=f, col_types=col_types)
 
@@ -638,7 +638,7 @@ Characteristics or Precision-Recall Curves.
 .. tabs::
    .. code-tab:: r R
 
-        f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
+        f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
         col_types <- list(by.col.name = c("high_priced"),
                           types = c("factor"))
         df <- h2o.importFile(path = f, col.types = col_types)
@@ -670,7 +670,7 @@ Characteristics or Precision-Recall Curves.
    .. code-tab:: python
 
         # Import HMDA dataset
-        f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
+        f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
         col_types = {'high_priced': "enum"}
         df = h2o.import_file(path=f, col_types=col_types)
 
@@ -749,7 +749,7 @@ This kind of SHAP plot can be obtained using ``model.fair_shap_plot``/``h2o.fair
 .. tabs::
    .. code-tab:: r R
 
-        f <- "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
+        f <- "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
         col_types <- list(by.col.name = c("high_priced"),
                           types = c("factor"))
         df <- h2o.importFile(path = f, col.types = col_types)
@@ -781,7 +781,7 @@ This kind of SHAP plot can be obtained using ``model.fair_shap_plot``/``h2o.fair
    .. code-tab:: python
 
         # Import HMDA dataset
-        f = "https://erin-data.s3.amazonaws.com/admissible/data/hmda_lar_2018_sample.csv"
+        f = "https://h2o-public-test-data.s3.amazonaws.com/smalldata/admissibleml_test/hmda_lar_2018_sample.csv"
         col_types = {'high_priced': "enum"}
         df = h2o.import_file(path=f, col_types=col_types)
 
@@ -836,4 +836,4 @@ Subhadeep Mukhopadhyay. *InfoGram and Admissible Machine Learning*, August 2021.
 
 LUM, Kristian, ZHANG, Yunfeng and BOWER, Amanda. *De-biasing “bias” measurement*, June 2022. `arXiv Url <https://arxiv.org/abs/2205.05770>`__.
 
-HARDT, Moritz, PRICE, Eric and SREBRO, Nathan. *Equality of Opportunity in Supervised Learning*, October 2016. `arXiv Url <https://arxiv.org/abs/1610.02413>`__.
+HARDT, Moritz, PRICE, Eric and SREBRO, Nathan. *Equality of Opportunity in Supervised Learning*, October 2016. `arXiv Url <https://arxiv.org/abs/1610.02413>`__.
diff --git a/h2o-docs/src/product/automl.rst b/h2o-docs/src/product/automl.rst
@@ -182,8 +182,8 @@ Here’s an example showing basic usage of the ``h2o.automl()`` function in *R*
         h2o.init()
 
         # Import a sample binary outcome train/test set into H2O
-        train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-        test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+        train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
+        test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 
         # Identify predictors and response
         y <- "response"
@@ -241,8 +241,8 @@ Here’s an example showing basic usage of the ``h2o.automl()`` function in *R*
         h2o.init()
 
         # Import a sample binary outcome train/test set into H2O
-        train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-        test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+        train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
+        test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 
         # Identify predictors and response
         x = train.columns

diff --git a/h2o-docs/src/product/data-science/algo-params/exclude_algos.rst b/h2o-docs/src/product/data-science/algo-params/exclude_algos.rst
@@ -34,7 +34,7 @@ Example
 		h2o.init()
 
 		# Import a sample binary outcome training set into H2O
-		train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
+		train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
 
 		# Identify predictors and response
 		x <- setdiff(names(train), y)
@@ -77,7 +77,7 @@ Example
 		h2o.init()
 
 		# Import a sample binary outcome training set into H2O
-		train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
+		train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
 
 		# Identify predictors and response
 		x = train.columns

diff --git a/h2o-docs/src/product/data-science/algo-params/include_algos.rst b/h2o-docs/src/product/data-science/algo-params/include_algos.rst
@@ -34,7 +34,7 @@ Example
 		h2o.init()
 
 		# Import a sample binary outcome training set into H2O
-		train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
+		train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
 
 		# Identify predictors and response
 		x <- setdiff(names(train), y)
@@ -77,7 +77,7 @@ Example
 		h2o.init()
 
 		# Import a sample binary outcome training set into H2O
-		train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
+		train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
 
 		# Identify predictors and response
 		x = train.columns

diff --git a/h2o-docs/src/product/data-science/algo-params/sort_metric.rst b/h2o-docs/src/product/data-science/algo-params/sort_metric.rst
@@ -38,7 +38,7 @@ Example
 		h2o.init()
 
 		# Import a sample binary outcome training set into H2O
-		train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
+		train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
 
 		# Identify predictors and response
 		x <- setdiff(names(train), y)
@@ -81,7 +81,7 @@ Example
 		h2o.init()
 
 		# Import a sample binary outcome training set into H2O
-		train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
+		train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
 
 		# Identify predictors and response
 		x = train.columns

diff --git a/h2o-docs/src/product/data-science/stacked-ensembles.rst b/h2o-docs/src/product/data-science/stacked-ensembles.rst
@@ -157,8 +157,8 @@ Below is a simple example showing how to build a Stacked Ensembles model.
         h2o.init()
 
         # Import a sample binary outcome train/test set into H2O
-        train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-        test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+        train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
+        test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 
         # Identify predictors and response
         y <- "response"
@@ -287,8 +287,8 @@ Below is a simple example showing how to build a Stacked Ensembles model.
         h2o.init()
 
         # Import a sample binary outcome train/test set into H2O
-        train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-        test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+        train = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
+        test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 
         # Identify predictors and response
         x = train.columns

diff --git a/h2o-docs/src/product/grid-search.rst b/h2o-docs/src/product/grid-search.rst
@@ -116,8 +116,8 @@ Grid Search Examples
     h2o.init()
 
     # Import a sample binary outcome dataset into H2O
-    data <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-    test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+    data <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
+    test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 
     # Identify predictors and response
     y <- "response"
@@ -175,8 +175,8 @@ Grid Search Examples
     h2o.init()
 
     # Import a sample binary outcome dataset into H2O
-    data = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-    test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+    data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
+    test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 
     # Identify predictors and response
     x = data.columns

diff --git a/h2o-docs/src/product/performance-and-prediction.rst b/h2o-docs/src/product/performance-and-prediction.rst
@@ -2428,8 +2428,8 @@ Allowed options include:
     h2o.init()
 
     # Import a sample binary outcome dataset into H2O
-    data <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-    test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+    data <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
+    test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 
     # Identify predictors and response
     y <- "response"
@@ -2470,8 +2470,8 @@ Allowed options include:
     h2o.init()
 
     # Import a sample binary outcome dataset into H2O
-    data = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-    test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+    data = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv")
+    test = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 
     # Identify predictors and response
     x = data.columns

diff --git a/h2o-r/ensemble/demos/h2o_ensemble_documentation_example.R b/h2o-r/ensemble/demos/h2o_ensemble_documentation_example.R
@@ -5,8 +5,8 @@ h2o.init(nthreads = -1)  # Start an H2O cluster with nthreads = num cores on you
 
 
 # Import a sample binary outcome train/test set into R
-train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_5k.csv")
-test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_5k.csv")
+test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 y <- "response"
 x <- setdiff(names(train), y)
 family <- "binomial"

diff --git a/h2o-r/ensemble/demos/h2o_metalearn_documentation_example.R b/h2o-r/ensemble/demos/h2o_metalearn_documentation_example.R
@@ -5,8 +5,8 @@ h2o.init(nthreads = -1)  # Start an H2O cluster with nthreads = num cores on you
 
 
 # Import a sample binary outcome train/test set into R
-train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_5k.csv")
-test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_5k.csv")
+test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 y <- "response"
 x <- setdiff(names(train), y)
 family <- "binomial"

diff --git a/h2o-r/ensemble/demos/h2o_stack_documentation_example.R b/h2o-r/ensemble/demos/h2o_stack_documentation_example.R
@@ -5,8 +5,8 @@ h2o.init(nthreads = -1)  # Start an H2O cluster with nthreads = num cores on you
 
 
 # Import a sample binary outcome train/test set into R
-train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_5k.csv")
-test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_5k.csv")
+test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 y <- "response"
 x <- setdiff(names(train), y)
 family <- "binomial"

diff --git a/h2o-r/ensemble/demos/higgs_randomgrid_stack.R b/h2o-r/ensemble/demos/higgs_randomgrid_stack.R
@@ -6,8 +6,8 @@ h2o.init(nthreads = -1)  # Start an H2O cluster with nthreads = num cores on you
 
 
 # Import a sample binary outcome train/test set into R
-train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_5k.csv")
-test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+train <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_5k.csv")
+test <- h2o.importFile("https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv")
 y <- "response"
 x <- setdiff(names(train), y)
 family <- "binomial"
@@ -172,4 +172,4 @@ print(perf3)
 #
 #Ensemble performance (AUC): 0.781995245966915
 
-# We have a winner.
+# We have a winner.
diff --git a/h2o-r/h2o-package/R/stackedensemble.R b/h2o-r/h2o-package/R/stackedensemble.R
@@ -59,8 +59,12 @@
 #' h2o.init()
 #' 
 #' # Import a sample binary outcome train/test set
-#' train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-#' test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+#' train <- h2o.importFile(
+#'     "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_train_10k.csv"
+#'     )
+#' test <- h2o.importFile(
+#'     "https://s3.amazonaws.com/h2o-public-test-data/smalldata/higgs/higgs_test_5k.csv"
+#'     )
 #' 
 #' # Identify predictors and response
 #' y <- "response"

diff --git a/h2o-r/tests/testdir_jira/runit_NOPASS_pub_2800.R b/h2o-r/tests/testdir_jira/runit_NOPASS_pub_2800.R
@@ -4,8 +4,8 @@ source("../../scripts/h2o-r-test-setup.R")
 # R behavior: Reports an error but keeps the frame as is
 
 test.pubdev.2800 <- function(conn){
-    df <- h2o.importFile("http://h2o-smalldata.s3.amazonaws.com/jira/test_string_missing.csv")
+    df <- h2o.importFile("smalldata/jira/test_string_missing.csv")
     expect_false(is.na(df[3,2]))
 }
 
-doTest("'0' Parsed incorrectly", test.pubdev.2800)
+doTest("'0' Parsed incorrectly", test.pubdev.2800)