Merge branch 'main' into gsprocessing-hard-negative
jalencato authored Nov 11, 2024
2 parents e631d2c + 19fac3b commit 0e7c471
Showing 19 changed files with 622 additions and 82 deletions.
17 changes: 14 additions & 3 deletions .github/workflow_scripts/pytest_check.sh
@@ -1,9 +1,20 @@
# Move to parent directory
#!/bin/env bash
# Move to repository root
set -ex

cd ../../

set -ex

FORCE_CUDA=1 python3 -m pip install -e '.[test]' --no-build-isolation
GS_HOME=$(pwd)
# Add SageMaker launch scripts to make the scripts testable
export PYTHONPATH="${PYTHONPATH}:${GS_HOME}/sagemaker/launch/"

python3 -m pip install pytest
FORCE_CUDA=1 python3 -m pip install -e '.[test]' --no-build-isolation

# Run SageMaker tests
python3 -m pytest -x ./tests/sagemaker-tests -s

# Run main library unit tests (Requires multi-gpu instance)
sh ./tests/unit-tests/prepare_test_data.sh
export NCCL_IB_DISABLE=1; export NCCL_SHM_DISABLE=1; NCCL_NET=Socket NCCL_DEBUG=INFO python3 -m pytest -x ./tests/unit-tests -s
8 changes: 8 additions & 0 deletions .gitignore
@@ -11,6 +11,9 @@ __pycache__/
*.py[cod]
*$py.class
*.egg-info/
.pytest_cache
.mypy_cache
.ipynb_checkpoints

# used by the container build
/code
@@ -20,3 +23,8 @@ __pycache__/
docs/build/
docs/source/_build
docs/source/generated

# IDEs and Python venv
.idea
.venv
.vscode
2 changes: 1 addition & 1 deletion docker/push_gsf_container.sh
@@ -5,7 +5,7 @@ set -euox pipefail
if [ -b "${1-}" ] && [ "$1" == "--help" ] || [ -b "${1-}" ] && [ "$1" == "-h" ]; then
echo "Usage: docker/push_gsf_container.sh <image-name> <tag> <region> <account>"
echo "Optionally provide the image name, tag, region and account number for the ecr repository"
echo "For example: docker/push_gsf_container.sh graphstorm sm us-west-2 1234567890"
echo "For example: docker/push_gsf_container.sh graphstorm sm-gpu us-west-2 1234567890"
exit 1
fi

19 changes: 10 additions & 9 deletions docker/sagemaker/Dockerfile.sm
@@ -20,15 +20,16 @@ FROM branch-${DEVICE} AS final

LABEL maintainer="Amazon AI Graph ML team"

# Install related Python packages
# Install required Python packages,
# we set the versions to match those of the base conda environment when possible
RUN pip3 install \
boto3 \
numba==0.58.1 \
numpy==1.26.1 \
boto3==1.34.112 \
numba==0.59.1 \
numpy==1.26.4 \
ogb==1.3.6 \
pyarrow \
scikit-learn \
scipy \
pyarrow==16.1.0 \
scikit-learn==1.5.0 \
scipy==1.13.1 \
transformers==4.28.1 \
&& rm -rf /root/.cache

@@ -49,12 +50,12 @@ ENV PYTHONPATH="/opt/ml/code/graphstorm/python/:${PYTHONPATH}"
RUN cp /opt/ml/code/graphstorm/sagemaker/run/* /opt/ml/code/

# Download DGL source code
RUN cd /root; git clone --branch v${DGL_VERSION} https://github.com/dmlc/dgl.git
RUN cd /root; git clone --branch v${DGL_VERSION} --single-branch https://github.com/dmlc/dgl.git
# Un-comment if we prefer a local DGL distribution
# COPY dgl /root/dgl
ENV PYTHONPATH="/root/dgl/tools/:${PYTHONPATH}"

WORKDIR /opt/ml/code

ENTRYPOINT ["bash", "-m", "start_with_right_hostname.sh"]
ENTRYPOINT ["bash", "-m", "/usr/local/bin/start_with_right_hostname.sh"]
CMD ["/bin/bash"]
5 changes: 3 additions & 2 deletions docker/sagemaker/build_artifacts/start_with_right_hostname.sh
@@ -5,7 +5,8 @@ if [[ "$1" = "train" ]]; then
sed -ie "s/PLACEHOLDER_HOSTNAME/$CURRENT_HOST/g" changehostname.c
gcc -o changehostname.o -c -fPIC -Wall changehostname.c
gcc -o libchangehostname.so -shared -export-dynamic changehostname.o -ldl
LD_PRELOAD=/libchangehostname.so train
CWD=$(pwd)
LD_PRELOAD=$CWD/libchangehostname.so train
else
eval "$@"
"$@"
fi
100 changes: 100 additions & 0 deletions docs/source/advanced/using-graphbolt.rst
@@ -166,3 +166,103 @@ data using our GraphBolt conversion entry point:
# We'll see the GraphBolt representation has been re-created
ls /tmp/acm_graphbolt/part0
edge_feat.dgl fused_csc_sampling_graph.pt graph.dgl node_feat.dgl
Using GraphBolt on SageMaker
----------------------------

Before we can train on SageMaker,
we need to ensure our data on S3 have been
converted to the GraphBolt format.
When using GConstruct to process our data,
we can include the GraphBolt data conversion in the GConstruct
step, as we'll show below.

When using distributed graph construction with GSProcessing and GSPartition,
preparing the data for GraphBolt on SageMaker requires launching
the GraphBolt data conversion step as a separate SageMaker job,
after the partitioned DGL graph files have been created on S3.

After running your distributed partition SageMaker job as usual using
``sagemaker/launch_partition.py``, you next need to launch the
``sagemaker/launch_graphbolt_convert.py`` script, passing as input
the S3 URI where ``launch_partition.py`` stored the DistDGL partition data,
**plus the suffix** ``dist_graph``, as that is where GSPartition creates the partition files.

For example, if you used ``--output-data-s3 s3://my-bucket/my-part-graph`` for
``sagemaker/launch_partition.py`` you need to use ``--graph-data-s3 s3://my-bucket/my-part-graph/dist_graph``
for ``sagemaker/launch_graphbolt_convert.py``.

Without GraphBolt, the SageMaker job sequence for distributed processing and training
is ``GSProcessing -> GSPartition -> GSTraining``. To use GraphBolt we need to add
a step after partitioning and before training:
``GSProcessing -> GSPartition -> GraphBoltConvert -> GSTraining``.

.. code-block:: bash

    cd graphstorm
    sagemaker/launch_partition.py \
        --graph-data-s3 "s3-uri-where-gsprocessing-data-exist" \
        --output-data-s3 "s3-uri-where-gspartition-data-will-be"
    # Add other required parameters like --partition-algorithm, --num-instances etc.

    # Once the above job succeeds we run the following command to convert the data to GraphBolt format.
    # Note the /dist_graph suffix!
    sagemaker/launch_graphbolt_convert.py \
        --graph-data-s3 "s3-uri-where-gspartition-data-will-be/dist_graph" \
        --metadata-filename "metadata.json" # Or <graph-name>.json for gconstruct-ed partitions

If your data are small enough to process on a single SageMaker instance
using ``GConstruct``, you can simply pass the ``--use-graphbolt true`` argument
to the ``GConstruct`` SageMaker launch script, and that will create the
necessary GraphBolt files as well.
So the job sequence there remains ``GConstruct -> GSTraining``.

.. code-block:: bash

    sagemaker/launch_gconstruct.py \
        --graph-data-s3 "s3-uri-where-raw-data-exist" \
        --output-data-s3 "s3-uri-where-gspartition-data-will-be" \
        --graph-config-file "gconstruct-config.json" \
        --use-graphbolt true

If you initially used GConstruct to create the non-GraphBolt DistDGL files,
you'll need to pass in the additional argument ``--metadata-filename``
to ``launch_graphbolt_convert.py``.
Use ``<graph-name>.json``, where the graph name is the
one you used with GConstruct, as shown below:

.. code-block:: bash

    # NOTE: we provide 'my-graph' as the graph name
    sagemaker/launch_gconstruct.py \
        --graph-name my-graph \
        --graph-data-s3 "s3-uri-where-raw-data-exist" \
        --output-data-s3 "s3-uri-where-gspartition-data-will-be" \
        --graph-config-file "gconstruct-config.json" # We don't add --use-graphbolt true

    # Once the above job succeeds we run the below to convert the data to GraphBolt
    # NOTE: the metadata file will be named 'my-graph.json'
    sagemaker/launch_graphbolt_convert.py \
        --graph-data-s3 "s3-uri-where-gspartition-data-will-be" \
        --metadata-filename "my-graph.json" # Should be <graph-name>.json

Once the data have been converted to the GraphBolt format, you can run your training
and inference jobs as before, passing the additional
argument ``--use-graphbolt true`` to the SageMaker launch scripts
to indicate that GraphBolt should be used during training and inference:

.. code-block:: bash

    sagemaker/launch_train.py \
        --graph-name my-graph \
        --graph-data-s3 "s3-uri-where-gspartition-data-will-be" \
        --yaml-s3 "s3-path-to-train-yaml" \
        --use-graphbolt true

If you want to test steps locally you can use SageMaker's
`local mode <https://sagemaker.readthedocs.io/en/stable/overview.html#local-mode>`_
by providing ``local`` as the instance type in the launch scripts.
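
For example, a local test of the training step might look like the sketch below.
This assumes the launch script exposes an ``--instance-type`` option; check the
script's ``--help`` output for the exact flag name.

.. code-block:: bash

    # Minimal sketch: run the training step in SageMaker local mode.
    # The --instance-type flag is an assumption; verify it against launch_train.py --help.
    sagemaker/launch_train.py \
        --graph-name my-graph \
        --graph-data-s3 "s3-uri-where-gspartition-data-will-be" \
        --yaml-s3 "s3-path-to-train-yaml" \
        --use-graphbolt true \
        --instance-type local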
82 changes: 63 additions & 19 deletions python/graphstorm/gpartition/convert_to_graphbolt.py
@@ -27,24 +27,72 @@

def parse_gbconv_args() -> argparse.Namespace:
"""Parses GraphBolt conversion arguments"""
parser = argparse.ArgumentParser("Convert partitioned DGL graph to GraphBolt format.")
parser.add_argument("--metadata-filepath", type=str, required=True,
help="File path to the partitioned DGL metadata file.")
parser.add_argument("--logging-level", type=str, default="info",
help="The logging level. The possible values: debug, info, warning, \
error. The default value is info.")
parser = argparse.ArgumentParser(
"Convert partitioned DGL graph to GraphBolt format."
)
parser.add_argument(
"--metadata-filepath",
type=str,
required=True,
help="File path to the partitioned DGL metadata file.",
)
parser.add_argument(
"--logging-level",
type=str,
default="info",
help="The logging level. The possible values: debug, info, warning, "
"error. The default value is info.",
)
parser.add_argument(
"--njobs",
type=int,
default=1,
help="Number of parallel processes to use for GraphBolt partition conversion. "
"Only applies for DGL >= v2.4.0.",
)

return parser.parse_args()


def main():
""" Entry point
def run_gb_conversion(part_config: str, njobs=1):
"""Converts the DistGraph data under the given part_config to GraphBolt
Parameters
----------
part_config : str
File path to the partitioned data metadata JSON file
njobs : int, optional
Number of partitions to convert in parallel, by default 1.
Only applies if DGL >= 2.4.0.
"""
dgl_version = importlib.metadata.version('dgl')
dgl_version = importlib.metadata.version("dgl")
if version.parse(dgl_version) < version.parse("2.4.0"):
if njobs > 1:
logging.warning(
"GB conversion njobs > 1 is only supported for DGL >= 2.4.0. "
"njobs will be set to 1."
)
dgl_distributed.dgl_partition_to_graphbolt(
part_config,
store_eids=True,
graph_formats="coo",
)
else:
dgl_distributed.dgl_partition_to_graphbolt( # pylint: disable=unexpected-keyword-arg
part_config,
store_eids=True,
graph_formats="coo",
njobs=njobs,
)


def main():
"""Entry point"""
dgl_version = importlib.metadata.version("dgl")
if version.parse(dgl_version) < version.parse("2.1.0"):
raise ValueError(
"GraphBolt conversion requires DGL version >= 2.1.0, "
f"but DGL version was {dgl_version}. "
"GraphBolt conversion requires DGL version >= 2.1.0, "
f"but DGL version was {dgl_version}. "
)

gb_conv_args = parse_gbconv_args()
@@ -59,14 +107,10 @@ def main():
gb_start = time.time()
logging.info("Converting partitions to GraphBolt format")

dgl_distributed.dgl_partition_to_graphbolt(
part_config,
store_eids=True,
graph_formats="coo",
)
run_gb_conversion(part_config, njobs=gb_conv_args.njobs)

logging.info("GraphBolt conversion took %f sec.", time.time() - gb_start)

logging.info("GraphBolt conversion took %f sec.",
time.time() - gb_start)

if __name__ == '__main__':
if __name__ == "__main__":
main()
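
Based on the argument parser defined above, a standalone invocation of this conversion
entry point might look like the following sketch; the metadata path is illustrative:

    # Convert an existing DistDGL partition to GraphBolt format using 4 parallel jobs
    python3 python/graphstorm/gpartition/convert_to_graphbolt.py \
        --metadata-filepath /path/to/partitioned-graph/metadata.json \
        --njobs 4 \
        --logging-level info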
14 changes: 7 additions & 7 deletions python/graphstorm/sagemaker/sagemaker_partition.py
@@ -24,7 +24,7 @@
import time
import subprocess
from threading import Thread, Event
from typing import List
from typing import Any, List

import boto3
import botocore
@@ -224,15 +224,15 @@ def run_partition(job_config: PartitionJobConfig):
metadata_filename = job_config.metadata_filename
skip_partitioning = job_config.skip_partitioning == 'true'

# Get env from either processing job or training job
# Get resource env from either processing job or training job
try:
with open("/opt/ml/config/resourceconfig.json", "r", encoding="utf-8") as f:
sm_env = json.load(f)
sm_resource_env: dict[str, Any] = json.load(f)
except FileNotFoundError:
sm_env = json.loads(os.environ['SM_TRAINING_ENV'])
sm_resource_env = json.loads(os.environ['SM_TRAINING_ENV'])

hosts = sm_env['hosts']
current_host = sm_env['current_host']
hosts: list[str] = sm_resource_env['hosts']
current_host: str = sm_resource_env['current_host']
world_size = len(hosts)
os.environ['WORLD_SIZE'] = str(world_size)
host_rank = hosts.index(current_host)
@@ -255,7 +255,7 @@ def run_partition(job_config: PartitionJobConfig):
for key, val in os.environ.items():
logging.debug("%s: %s", key, val)

leader_addr = socket.gethostbyname('algo-1')
leader_addr = socket.gethostbyname(sorted(hosts)[0])
# sync with all instances in the cluster
if host_rank == 0:
# sync with workers
3 changes: 3 additions & 0 deletions python/graphstorm/sagemaker/utils.py
@@ -280,6 +280,9 @@ def download_graph(graph_data_s3, graph_name, part_id, world_size,
# Something else has gone wrong.
raise err

assert graph_config, \
(f"Could not find a graph config file named {graph_name}.json or metadata.json "
f"under {graph_data_s3}")
S3Downloader.download(os.path.join(graph_data_s3, graph_config),
graph_path, sagemaker_session=sagemaker_session)
try: