From 716839d773046fa059d2e800672986957fa0832b Mon Sep 17 00:00:00 2001
From: Theodore Vasiloudis
Date: Mon, 13 Jan 2025 01:08:12 +0000
Subject: [PATCH] [SageMaker] Add launcher and support for SageMaker HPO jobs

---
 .../distributed/sagemaker.rst                 |  74 +++++++
 .../graphstorm/sagemaker/sagemaker_train.py   |  26 ++-
 .../launch/launch_hyperparameter_tuning.py    | 209 ++++++++++++++++++
 sagemaker/run/train_entry.py                  |   9 +-
 4 files changed, 308 insertions(+), 10 deletions(-)
 create mode 100644 sagemaker/launch/launch_hyperparameter_tuning.py

diff --git a/docs/source/cli/model-training-inference/distributed/sagemaker.rst b/docs/source/cli/model-training-inference/distributed/sagemaker.rst
index 603b529d7d..68a52d8cc5 100644
--- a/docs/source/cli/model-training-inference/distributed/sagemaker.rst
+++ b/docs/source/cli/model-training-inference/distributed/sagemaker.rst
@@ -213,6 +213,80 @@ from ``${DATASET_S3_PATH}`` as input and create a DistDGL graph with
 ``${NUM_PARTITIONS}`` under the output path, ``${OUTPUT_PATH}``.
 Currently we only support ``random`` as the partitioning algorithm.
 
+Launch hyper-parameter optimization task
+````````````````````````````````````````
+
+GraphStorm supports `automatic model tuning `_
+with SageMaker AI,
+which allows you to optimize the hyper-parameters
+of your model with an easy-to-use interface.
+
+The ``launch/launch_hyperparameter_tuning.py`` script acts as a thin
+wrapper around SageMaker's `HyperparameterTuner `_.
+
+You define the hyper-parameters of interest by passing a Python dictionary as a JSON string,
+where the keys of the dictionary are the names of the parameters to tune,
+and each value is a dictionary that defines the type and tuning range
+of the parameter.
+
+.. code:: bash
+
+    # Example hyper-parameter ranges
+    python launch/launch_hyperparameter_tuning.py \
+        --hyperparameter-ranges '{"lr": {"type": "continuous", "min": 1e-5, "max": 1e-2, "scaling_type": "Auto"}, "num_layers": {"type": "integer", "min": 1, "max": 3}, "model_encoder_type": {"type": "categorical", "values": ["rgcn", "rgat"]}}'
+
+The supported parameter types are ``continuous``
+for real-valued parameters, ``integer`` for integer parameters,
+and ``categorical`` for discrete string parameters.
+These directly correspond to SageMaker's
+`Dynamic hyper-parameters `_.
+
+Continuous and integer parameters need to define a ``"min"`` and ``"max"`` value to use
+during tuning, while categorical parameters need to provide a list of strings
+as ``"values"`` to choose from.
+
+For continuous and integer parameters you can also provide a ``scaling_type``
+string that directly corresponds to one of SageMaker's
+`scaling types `_,
+i.e. one of ``'Auto'``, ``'Linear'``, ``'Logarithmic'``, or ``'ReverseLogarithmic'``.
+By default the scaling type is ``'Auto'``.
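+
+Under the hood, the launcher converts each entry of this dictionary into the
+corresponding parameter class of the SageMaker Python SDK. The snippet below is
+a minimal sketch of that mapping for the example ranges above; the
+``hyperparameter_ranges`` variable name is only illustrative.
+
+.. code:: python
+
+    from sagemaker.tuner import (
+        CategoricalParameter,
+        ContinuousParameter,
+        IntegerParameter,
+    )
+
+    # Roughly what the launcher builds from the JSON ranges shown above
+    hyperparameter_ranges = {
+        "lr": ContinuousParameter(1e-5, 1e-2, scaling_type="Auto"),
+        "num_layers": IntegerParameter(1, 3),
+        "model_encoder_type": CategoricalParameter(["rgcn", "rgat"]),
+    }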
+
+Use ``--metric-name`` to define the name of the metric to use as the tuning objective,
+e.g. ``"accuracy"``. See the entry for ``eval_metric`` in :ref:`Evaluation Metrics `
+for a full list of supported metrics.
+
+``--metric-dataset`` defines which dataset to collect metrics from, and
+can be either ``"test"`` or ``"val"`` to collect metrics on the test or validation set,
+respectively. Use ``--objective-type`` to set the type of the objective,
+which can be either ``"Maximize"`` or ``"Minimize"``.
+
+Finally, you can use ``--strategy`` to select the optimization strategy
+from one of ``"Bayesian"``, ``"Random"``, ``"Hyperband"``, or ``"Grid"``. See the
+`SageMaker documentation `_
+for more details on each strategy.
+
+Example HPO call:
+
+.. code:: bash
+
+    python launch/launch_hyperparameter_tuning.py \
+        --task-name my-gnn-hpo-job \
+        --role arn:aws:iam::123456789012:role/SageMakerRole \
+        --region us-west-2 \
+        --image-url 123456789012.dkr.ecr.us-west-2.amazonaws.com/graphstorm:sagemaker-gpu \
+        --graph-name my-graph \
+        --task-type node_classification \
+        --graph-data-s3 s3://my-bucket/graph-data/ \
+        --yaml-s3 s3://my-bucket/train.yaml \
+        --model-artifact-s3 s3://my-bucket/model-artifacts/ \
+        --max-jobs 20 \
+        --max-parallel-jobs 4 \
+        --hyperparameter-ranges '{"lr": {"type": "continuous", "min": 1e-5, "max": 1e-2}, "num_layers": {"type": "integer", "min": 2, "max": 5}}' \
+        --metric-name "accuracy" \
+        --metric-dataset "val" \
+        --objective-type "Maximize" \
+        --strategy "Bayesian"
+
 Passing additional arguments to the SageMaker Estimator
 ```````````````````````````````````````````````````````
 Sometimes you might want to pass additional arguments to the constructor
diff --git a/python/graphstorm/sagemaker/sagemaker_train.py b/python/graphstorm/sagemaker/sagemaker_train.py
index fe8aafd146..c26f94e416 100644
--- a/python/graphstorm/sagemaker/sagemaker_train.py
+++ b/python/graphstorm/sagemaker/sagemaker_train.py
@@ -15,7 +15,7 @@
 
     Training entry point.
 """
-# Install additional requirements
+
 import os
 import logging
 import socket
@@ -164,7 +164,16 @@ def run_train(args, unknownargs):
         os.makedirs(restore_model_path, exist_ok=True)
     else:
         restore_model_path = None
-    output_path = "/tmp/gsgnn_model/"
+
+    if args.model_artifact_s3:
+        # If the user provided an S3 output destination as an input arg, the script itself
+        # will upload the model artifacts after training, so we save under /tmp.
+        output_path = "/tmp/gsgnn_model/"
+    else:
+        # If the user did not provide an output destination as an arg, we rely on SageMaker to
+        # do the model upload, so we save the model to the pre-determined path /opt/ml/model
+        output_path = "/opt/ml/model"
+
     os.makedirs(output_path, exist_ok=True)
 
     # start the ssh server
@@ -229,7 +238,11 @@ def run_train(args, unknownargs):
     graph_data_s3 = args.graph_data_s3
     task_type = args.task_type
     train_yaml_s3 = args.train_yaml_s3
-    model_artifact_s3 = args.model_artifact_s3.rstrip('/')
+    # If the user provided an output destination, trim any trailing '/'
+    if args.model_artifact_s3:
+        gs_model_artifact_s3 = args.model_artifact_s3.rstrip('/')
+    else:
+        gs_model_artifact_s3 = None
     custom_script = args.custom_script
 
     boto_session = boto3.session.Session(region_name=args.region)
@@ -292,6 +305,7 @@ def run_train(args, unknownargs):
         logging.error("Task failed")
         sys.exit(-1)
 
-    # If there are saved models
-    if os.path.exists(save_model_path):
-        upload_model_artifacts(model_artifact_s3, save_model_path, sagemaker_session)
+    # We upload models only when the user explicitly set the model_artifact_s3
+    # argument. Otherwise we can rely on the SageMaker service to do the upload.
+    if gs_model_artifact_s3 and os.path.exists(save_model_path):
+        upload_model_artifacts(gs_model_artifact_s3, save_model_path, sagemaker_session)
diff --git a/sagemaker/launch/launch_hyperparameter_tuning.py b/sagemaker/launch/launch_hyperparameter_tuning.py
new file mode 100644
index 0000000000..e551a8b4a3
--- /dev/null
+++ b/sagemaker/launch/launch_hyperparameter_tuning.py
@@ -0,0 +1,209 @@
+r"""
+    Copyright 2023 Contributors
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Launch SageMaker HPO jobs. + + Usage: + + python launch_hyperparameter_tuning.py \ + --task-name my-gnn-hpo-job \ + --role arn:aws:iam::123456789012:role/SageMakerRole \ + --region us-west-2 \ + --image-url 123456789012.dkr.ecr.us-west-2.amazonaws.com/my-graphstorm-image:latest \ + --graph-name my-graph \ + --task-type node_classification \ + --graph-data-s3 s3://my-bucket/graph-data/ \ + --yaml-s3 s3://my-bucket/train.yaml \ + --model-artifact-s3 s3://my-bucket/model-artifacts/ \ + --max-jobs 20 \ + --max-parallel-jobs 4 \ + --hyperparameter-ranges '{"lr": {"type": "continuous", "min": 1e-5, "max": 1e-2}, "num_layers": {"type": "integer", "min": 2, "max": 5}, "activation": {"type": "categorical", "values": ["relu", "tanh"]}}' \ + --metric-name "accuracy" \ + --metric-dataset "val" \ + --objective-type Maximize +""" +import os +import json + +from sagemaker.pytorch.estimator import PyTorch +from sagemaker.tuner import ( + HyperparameterTuner, + ContinuousParameter, + IntegerParameter, + CategoricalParameter, +) + +from common_parser import ( + parse_estimator_kwargs, + parse_unknown_gs_args, +) +from launch_train import get_train_parser + +INSTANCE_TYPE = "ml.g4dn.12xlarge" + + +def parse_hyperparameter_ranges(hyperparameter_ranges_json): + """Parse the hyperparameter ranges from JSON string.""" + ranges = json.loads(hyperparameter_ranges_json) + hyperparameter_ranges = {} + for param, config in ranges.items(): + if config["type"] == "continuous": + hyperparameter_ranges[param] = ContinuousParameter( + config["min"], config["max"], config.get("scaling_type", "Auto") + ) + elif config["type"] == "integer": + hyperparameter_ranges[param] = IntegerParameter( + config["min"], config["max"], config.get("scaling_type", "Auto") + ) + elif config["type"] == "categorical": + hyperparameter_ranges[param] = CategoricalParameter(config["values"]) + return hyperparameter_ranges + + +def run_hyperparameter_tuning_job(args, image, unknownargs): + """Run hyperparameter tuning job using SageMaker HyperparameterTuner""" + + container_image_uri = image + + prefix = f"gs-hpo-{args.graph_name}" + + params = { + "eval-metric": args.metric_name, + "graph-data-s3": args.graph_data_s3, + "graph-name": args.graph_name, + "log-level": args.log_level, + "task-type": args.task_type, + "train-yaml-s3": args.yaml_s3, + } + if args.custom_script is not None: + params["custom-script"] = args.custom_script + if args.model_checkpoint_to_load is not None: + params["model-checkpoint-to-load"] = args.model_checkpoint_to_load + + unknown_args_dict = parse_unknown_gs_args(unknownargs) + params.update(unknown_args_dict) + + print(f"SageMaker launch parameters {params}") + print(f"GraphStorm forwarded parameters {unknown_args_dict}") + + estimator_kwargs = parse_estimator_kwargs(args.sm_estimator_parameters) + + est = PyTorch( + entry_point=os.path.basename(args.entry_point), + source_dir=os.path.dirname(args.entry_point), + image_uri=container_image_uri, + role=args.role, + instance_count=args.instance_count, + instance_type=args.instance_type, + output_path=args.model_artifact_s3, + py_version="py3", + base_job_name=prefix, + 
hyperparameters=params, + tags=[ + {"Key": "GraphStorm", "Value": "oss"}, + {"Key": "GraphStorm_Task", "Value": "HPO"}, + ], + **estimator_kwargs, + ) + + hyperparameter_ranges = parse_hyperparameter_ranges(args.hyperparameter_ranges) + + # Construct the full metric name based on user input + full_metric_name = f"best_{args.metric_dataset}_score:{args.metric_name}" + + tuner = HyperparameterTuner( + estimator=est, + objective_metric_name=full_metric_name, + hyperparameter_ranges=hyperparameter_ranges, + objective_type=args.objective_type, + max_jobs=args.max_jobs, + max_parallel_jobs=args.max_parallel_jobs, + metric_definitions=[ + { + "Name": full_metric_name, + "Regex": ( + f"INFO:root:best_{args.metric_dataset}_score: " + f"{{'{args.metric_name}': ([0-9\\.]+)}}" + ), + } + ], + strategy=args.strategy, + ) + + tuner.fit({"train": args.yaml_s3}, wait=not args.async_execution) + + +def get_hpo_parser(): + """Return a parser for GraphStorm hyperparameter tuning task.""" + parser = get_train_parser() + + hpo_group = parser.add_argument_group("Hyperparameter tuning arguments") + + hpo_group.add_argument( + "--max-jobs", + type=int, + default=10, + help="Maximum number of training jobs to run", + ) + hpo_group.add_argument( + "--max-parallel-jobs", + type=int, + default=2, + help="Maximum number of parallel training jobs", + ) + hpo_group.add_argument( + "--hyperparameter-ranges", + type=str, + required=True, + help="JSON string defining hyperparameter ranges", + ) + hpo_group.add_argument( + "--metric-name", + type=str, + required=True, + help="Name of the metric to optimize (e.g., 'accuracy', 'amri')", + ) + hpo_group.add_argument( + "--metric-dataset", + type=str, + required=True, + choices=["test", "val"], + help="Whether to use test or validation metrics for HPO.", + ) + hpo_group.add_argument( + "--objective-type", + type=str, + default="Maximize", + choices=["Maximize", "Minimize"], + help="Type of objective, can be 'Maximize' or 'Minimize'", + ) + hpo_group.add_argument( + "--strategy", + type=str, + default="Bayesian", + choices=["Bayesian", "Random", "Hyperband", "Grid"], + help="Optimization strategy. Default: 'Bayesian'.", + ) + + return parser + + +if __name__ == "__main__": + arg_parser = get_hpo_parser() + args, unknownargs = arg_parser.parse_known_args() + print(f"HPO launch Known args: '{args}'") + print(f"HPO launch unknown args:{type(unknownargs)=} '{unknownargs=}'") + + run_hyperparameter_tuning_job(args, args.image_url, unknownargs) diff --git a/sagemaker/run/train_entry.py b/sagemaker/run/train_entry.py index 0d08948370..91afb5a8f9 100644 --- a/sagemaker/run/train_entry.py +++ b/sagemaker/run/train_entry.py @@ -37,10 +37,11 @@ def get_train_parser(): required=True) parser.add_argument("--train-yaml-s3", type=str, help="S3 location of training yaml file. " - "Do not store it with partitioned graph", - required=True) - parser.add_argument("--model-artifact-s3", type=str, - help="S3 location to store the model artifacts.") + "Do not store it with partitioned graph") + parser.add_argument("--model-artifact-s3", type=str, default=None, + help="S3 location to store the model artifacts. If None, we rely on SageMaker " + "to upload model artifacts, so the launching Estimator needs to have 'output_path' set. " + "Default: None") parser.add_argument("--model-checkpoint-to-load", type=str, default=None, help="S3 path to a model checkpoint from a previous training task " "that is going to be resumed.")