From 716839d773046fa059d2e800672986957fa0832b Mon Sep 17 00:00:00 2001
From: Theodore Vasiloudis
Date: Mon, 13 Jan 2025 01:08:12 +0000
Subject: [PATCH] [SageMaker] Add launcher and support for SageMaker HPO jobs

---
 .../distributed/sagemaker.rst                 |  74 +++++++
 .../graphstorm/sagemaker/sagemaker_train.py   |  26 ++-
 .../launch/launch_hyperparameter_tuning.py    | 209 ++++++++++++++++++
 sagemaker/run/train_entry.py                  |   9 +-
 4 files changed, 308 insertions(+), 10 deletions(-)
 create mode 100644 sagemaker/launch/launch_hyperparameter_tuning.py

diff --git a/docs/source/cli/model-training-inference/distributed/sagemaker.rst b/docs/source/cli/model-training-inference/distributed/sagemaker.rst
index 603b529d7d..68a52d8cc5 100644
--- a/docs/source/cli/model-training-inference/distributed/sagemaker.rst
+++ b/docs/source/cli/model-training-inference/distributed/sagemaker.rst
@@ -213,6 +213,80 @@ from ``${DATASET_S3_PATH}`` as input and create a DistDGL graph with
 ``${NUM_PARTITIONS}`` under the output path, ``${OUTPUT_PATH}``.
 Currently we only support ``random`` as the partitioning algorithm.
 
+Launch hyper-parameter optimization task
+````````````````````````````````````````
+
+GraphStorm supports `automatic model tuning `_
+with SageMaker AI,
+which allows you to optimize the hyper-parameters
+of your model with an easy-to-use interface.
+
+The ``launch/launch_hyperparameter_tuning.py`` script acts as a thin
+wrapper around SageMaker's `HyperparameterTuner `_.
+
+You define the hyper-parameters of interest by passing a Python dictionary as a JSON string,
+where the keys of the dictionary are the names of the parameters to tune,
+and each value is a dictionary that defines the type and tuning range
+of the parameter.
+
+.. code:: bash
+
+    # Example hyper-parameter ranges
+    python launch/launch_hyperparameter_tuning.py \
+        --hyperparameter-ranges '{"lr": {"type": "continuous", "min": 1e-5, "max": 1e-2, "scaling_type": "Auto"}, "num_layers": {"type": "integer", "min": 1, "max": 3}, "model_encoder_type": {"type": "categorical", "values": ["rgcn", "rgat"]}}'
+
+The supported parameter types are ``continuous``
+for real-valued parameters, ``integer`` for integer parameters,
+and ``categorical`` for discrete string parameters.
+These directly correspond to SageMaker's
+`Dynamic hyper-parameters `_.
+
+Continuous and integer parameters need to define a ``"min"`` and ``"max"`` value to use
+during tuning, while categorical parameters need to provide a list of strings
+as ``"values"`` to choose from.
+
+For continuous and integer parameters you can also provide a ``scaling_type``
+string that directly corresponds to one of SageMaker's
+`scaling types `_,
+i.e. one of ``'Auto'``, ``'Linear'``, ``'Logarithmic'``, or ``'ReverseLogarithmic'``.
+By default the scaling type is ``'Auto'``.
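+
+Under the hood, the launcher converts each entry of this dictionary into the
+corresponding parameter class of the SageMaker Python SDK. The snippet below is
+a minimal sketch of that mapping for the example ranges above; the
+``hyperparameter_ranges`` variable name is only illustrative.
+
+.. code:: python
+
+    from sagemaker.tuner import (
+        CategoricalParameter,
+        ContinuousParameter,
+        IntegerParameter,
+    )
+
+    # Roughly what the launcher builds from the JSON ranges shown above
+    hyperparameter_ranges = {
+        "lr": ContinuousParameter(1e-5, 1e-2, scaling_type="Auto"),
+        "num_layers": IntegerParameter(1, 3),
+        "model_encoder_type": CategoricalParameter(["rgcn", "rgat"]),
+    }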
+
+Use ``--metric-name`` to define the name of the metric to use as the tuning objective,
+e.g. ``"accuracy"``. See the entry for ``eval_metric`` in :ref:`Evaluation Metrics `
+for a full list of supported metrics.
+
+``--metric-dataset`` defines which dataset to collect metrics from, and
+can be either ``"test"`` or ``"val"`` to collect metrics on the test or validation set,
+respectively. Use ``--objective-type`` to set the type of the objective,
+which can be either ``"Maximize"`` or ``"Minimize"``.
+
+Finally, you can use ``--strategy`` to select the optimization strategy
+from one of ``"Bayesian"``, ``"Random"``, ``"Hyperband"``, or ``"Grid"``. See the
+`SageMaker documentation `_
+for more details on each strategy.
+
+Example HPO call:
+
+.. code:: bash
+
+    python launch/launch_hyperparameter_tuning.py \
+        --task-name my-gnn-hpo-job \
+        --role arn:aws:iam::123456789012:role/SageMakerRole \
+        --region us-west-2 \
+        --image-url 123456789012.dkr.ecr.us-west-2.amazonaws.com/graphstorm:sagemaker-gpu \
+        --graph-name my-graph \
+        --task-type node_classification \
+        --graph-data-s3 s3://my-bucket/graph-data/ \
+        --yaml-s3 s3://my-bucket/train.yaml \
+        --model-artifact-s3 s3://my-bucket/model-artifacts/ \
+        --max-jobs 20 \
+        --max-parallel-jobs 4 \
+        --hyperparameter-ranges '{"lr": {"type": "continuous", "min": 1e-5, "max": 1e-2}, "num_layers": {"type": "integer", "min": 2, "max": 5}}' \
+        --metric-name "accuracy" \
+        --metric-dataset "val" \
+        --objective-type "Maximize" \
+        --strategy "Bayesian"
+
 Passing additional arguments to the SageMaker Estimator
 ```````````````````````````````````````````````````````
 Sometimes you might want to pass additional arguments to the constructor
diff --git a/python/graphstorm/sagemaker/sagemaker_train.py b/python/graphstorm/sagemaker/sagemaker_train.py
index fe8aafd146..c26f94e416 100644
--- a/python/graphstorm/sagemaker/sagemaker_train.py
+++ b/python/graphstorm/sagemaker/sagemaker_train.py
@@ -15,7 +15,7 @@
 
     Training entry point.
 """
-# Install additional requirements
+
 import os
 import logging
 import socket
@@ -164,7 +164,16 @@ def run_train(args, unknownargs):
         os.makedirs(restore_model_path, exist_ok=True)
     else:
         restore_model_path = None
-    output_path = "/tmp/gsgnn_model/"
+
+    if args.model_artifact_s3:
+        # If the user provided an S3 output destination as an input arg, the script itself
+        # will upload the model artifacts after training, so we save under /tmp.
+        output_path = "/tmp/gsgnn_model/"
+    else:
+        # If the user did not provide an output destination as an arg, we rely on SageMaker to
+        # do the model upload, so we save the model to the pre-determined path /opt/ml/model
+        output_path = "/opt/ml/model"
+
     os.makedirs(output_path, exist_ok=True)
 
     # start the ssh server
@@ -229,7 +238,11 @@ def run_train(args, unknownargs):
     graph_data_s3 = args.graph_data_s3
     task_type = args.task_type
     train_yaml_s3 = args.train_yaml_s3
-    model_artifact_s3 = args.model_artifact_s3.rstrip('/')
+    # If the user provided an output destination, trim any trailing '/'
+    if args.model_artifact_s3:
+        gs_model_artifact_s3 = args.model_artifact_s3.rstrip('/')
+    else:
+        gs_model_artifact_s3 = None
     custom_script = args.custom_script
 
     boto_session = boto3.session.Session(region_name=args.region)
@@ -292,6 +305,7 @@ def run_train(args, unknownargs):
         logging.error("Task failed")
         sys.exit(-1)
 
-    # If there are saved models
-    if os.path.exists(save_model_path):
-        upload_model_artifacts(model_artifact_s3, save_model_path, sagemaker_session)
+    # We upload models only when the user explicitly set the model_artifact_s3
+    # argument. Otherwise we can rely on the SageMaker service to do the upload.
+    if gs_model_artifact_s3 and os.path.exists(save_model_path):
+        upload_model_artifacts(gs_model_artifact_s3, save_model_path, sagemaker_session)
diff --git a/sagemaker/launch/launch_hyperparameter_tuning.py b/sagemaker/launch/launch_hyperparameter_tuning.py
new file mode 100644
index 0000000000..e551a8b4a3
--- /dev/null
+++ b/sagemaker/launch/launch_hyperparameter_tuning.py
@@ -0,0 +1,209 @@
+r"""
+    Copyright 2023 Contributors
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Launch SageMaker HPO jobs. + + Usage: + + python launch_hyperparameter_tuning.py \ + --task-name my-gnn-hpo-job \ + --role arn:aws:iam::123456789012:role/SageMakerRole \ + --region us-west-2 \ + --image-url 123456789012.dkr.ecr.us-west-2.amazonaws.com/my-graphstorm-image:latest \ + --graph-name my-graph \ + --task-type node_classification \ + --graph-data-s3 s3://my-bucket/graph-data/ \ + --yaml-s3 s3://my-bucket/train.yaml \ + --model-artifact-s3 s3://my-bucket/model-artifacts/ \ + --max-jobs 20 \ + --max-parallel-jobs 4 \ + --hyperparameter-ranges '{"lr": {"type": "continuous", "min": 1e-5, "max": 1e-2}, "num_layers": {"type": "integer", "min": 2, "max": 5}, "activation": {"type": "categorical", "values": ["relu", "tanh"]}}' \ + --metric-name "accuracy" \ + --metric-dataset "val" \ + --objective-type Maximize +""" +import os +import json + +from sagemaker.pytorch.estimator import PyTorch +from sagemaker.tuner import ( + HyperparameterTuner, + ContinuousParameter, + IntegerParameter, + CategoricalParameter, +) + +from common_parser import ( + parse_estimator_kwargs, + parse_unknown_gs_args, +) +from launch_train import get_train_parser + +INSTANCE_TYPE = "ml.g4dn.12xlarge" + + +def parse_hyperparameter_ranges(hyperparameter_ranges_json): + """Parse the hyperparameter ranges from JSON string.""" + ranges = json.loads(hyperparameter_ranges_json) + hyperparameter_ranges = {} + for param, config in ranges.items(): + if config["type"] == "continuous": + hyperparameter_ranges[param] = ContinuousParameter( + config["min"], config["max"], config.get("scaling_type", "Auto") + ) + elif config["type"] == "integer": + hyperparameter_ranges[param] = IntegerParameter( + config["min"], config["max"], config.get("scaling_type", "Auto") + ) + elif config["type"] == "categorical": + hyperparameter_ranges[param] = CategoricalParameter(config["values"]) + return hyperparameter_ranges + + +def run_hyperparameter_tuning_job(args, image, unknownargs): + """Run hyperparameter tuning job using SageMaker HyperparameterTuner""" + + container_image_uri = image + + prefix = f"gs-hpo-{args.graph_name}" + + params = { + "eval-metric": args.metric_name, + "graph-data-s3": args.graph_data_s3, + "graph-name": args.graph_name, + "log-level": args.log_level, + "task-type": args.task_type, + "train-yaml-s3": args.yaml_s3, + } + if args.custom_script is not None: + params["custom-script"] = args.custom_script + if args.model_checkpoint_to_load is not None: + params["model-checkpoint-to-load"] = args.model_checkpoint_to_load + + unknown_args_dict = parse_unknown_gs_args(unknownargs) + params.update(unknown_args_dict) + + print(f"SageMaker launch parameters {params}") + print(f"GraphStorm forwarded parameters {unknown_args_dict}") + + estimator_kwargs = parse_estimator_kwargs(args.sm_estimator_parameters) + + est = PyTorch( + entry_point=os.path.basename(args.entry_point), + source_dir=os.path.dirname(args.entry_point), + image_uri=container_image_uri, + role=args.role, + instance_count=args.instance_count, + instance_type=args.instance_type, + output_path=args.model_artifact_s3, + py_version="py3", + base_job_name=prefix, + 
hyperparameters=params, + tags=[ + {"Key": "GraphStorm", "Value": "oss"}, + {"Key": "GraphStorm_Task", "Value": "HPO"}, + ], + **estimator_kwargs, + ) + + hyperparameter_ranges = parse_hyperparameter_ranges(args.hyperparameter_ranges) + + # Construct the full metric name based on user input + full_metric_name = f"best_{args.metric_dataset}_score:{args.metric_name}" + + tuner = HyperparameterTuner( + estimator=est, + objective_metric_name=full_metric_name, + hyperparameter_ranges=hyperparameter_ranges, + objective_type=args.objective_type, + max_jobs=args.max_jobs, + max_parallel_jobs=args.max_parallel_jobs, + metric_definitions=[ + { + "Name": full_metric_name, + "Regex": ( + f"INFO:root:best_{args.metric_dataset}_score: " + f"{{'{args.metric_name}': ([0-9\\.]+)}}" + ), + } + ], + strategy=args.strategy, + ) + + tuner.fit({"train": args.yaml_s3}, wait=not args.async_execution) + + +def get_hpo_parser(): + """Return a parser for GraphStorm hyperparameter tuning task.""" + parser = get_train_parser() + + hpo_group = parser.add_argument_group("Hyperparameter tuning arguments") + + hpo_group.add_argument( + "--max-jobs", + type=int, + default=10, + help="Maximum number of training jobs to run", + ) + hpo_group.add_argument( + "--max-parallel-jobs", + type=int, + default=2, + help="Maximum number of parallel training jobs", + ) + hpo_group.add_argument( + "--hyperparameter-ranges", + type=str, + required=True, + help="JSON string defining hyperparameter ranges", + ) + hpo_group.add_argument( + "--metric-name", + type=str, + required=True, + help="Name of the metric to optimize (e.g., 'accuracy', 'amri')", + ) + hpo_group.add_argument( + "--metric-dataset", + type=str, + required=True, + choices=["test", "val"], + help="Whether to use test or validation metrics for HPO.", + ) + hpo_group.add_argument( + "--objective-type", + type=str, + default="Maximize", + choices=["Maximize", "Minimize"], + help="Type of objective, can be 'Maximize' or 'Minimize'", + ) + hpo_group.add_argument( + "--strategy", + type=str, + default="Bayesian", + choices=["Bayesian", "Random", "Hyperband", "Grid"], + help="Optimization strategy. Default: 'Bayesian'.", + ) + + return parser + + +if __name__ == "__main__": + arg_parser = get_hpo_parser() + args, unknownargs = arg_parser.parse_known_args() + print(f"HPO launch Known args: '{args}'") + print(f"HPO launch unknown args:{type(unknownargs)=} '{unknownargs=}'") + + run_hyperparameter_tuning_job(args, args.image_url, unknownargs) diff --git a/sagemaker/run/train_entry.py b/sagemaker/run/train_entry.py index 0d08948370..91afb5a8f9 100644 --- a/sagemaker/run/train_entry.py +++ b/sagemaker/run/train_entry.py @@ -37,10 +37,11 @@ def get_train_parser(): required=True) parser.add_argument("--train-yaml-s3", type=str, help="S3 location of training yaml file. " - "Do not store it with partitioned graph", - required=True) - parser.add_argument("--model-artifact-s3", type=str, - help="S3 location to store the model artifacts.") + "Do not store it with partitioned graph") + parser.add_argument("--model-artifact-s3", type=str, default=None, + help="S3 location to store the model artifacts. If None, we rely on SageMaker " + "to upload model artifacts, so the launching Estimator needs to have 'output_path' set. " + "Default: None") parser.add_argument("--model-checkpoint-to-load", type=str, default=None, help="S3 path to a model checkpoint from a previous training task " "that is going to be resumed.")