From 687f58f9ce5cbcde0449f147a5bbdbc26f11bde1 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Thu, 2 Jan 2025 14:43:32 +0530 Subject: [PATCH 01/32] finetuning llama3 8b Signed-off-by: Malay Nagda --- .../llm/performance/finetuning_llama3_8b.py | 172 ++++++++++++++++++ scripts/llm/performance/utils.py | 6 + 2 files changed, 178 insertions(+) create mode 100644 scripts/llm/performance/finetuning_llama3_8b.py diff --git a/scripts/llm/performance/finetuning_llama3_8b.py b/scripts/llm/performance/finetuning_llama3_8b.py new file mode 100644 index 000000000000..e6feaec3cb26 --- /dev/null +++ b/scripts/llm/performance/finetuning_llama3_8b.py @@ -0,0 +1,172 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import nemo_run as run +from nemo_run.config import NEMORUN_HOME +from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor + +from nemo.collections.llm.recipes.llama3_8b import finetune_recipe +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin +from nemo.utils import logging + +NUM_NODES = 1 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 1 +GLOBAL_BATCH_SIZE = 32 +TP_SIZE = 1 +PP_SIZE = 1 +CP_SIZE = 1 +VP_SIZE = None +MAX_STEPS = 100 + + +def llama3_8b_performance_recipe( + finetuning_scheme: str, + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + llama3 8b pre-train recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results. 
+ """ + recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.num_train_samples = max_steps * gbs # ensure only 1 epoch for whole run + recipe.data.tokenizer = hf_tokenizer("meta-llama/Meta-Llama-3-8B") + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + if tp_size > 1: + recipe.trainer.strategy.sequence_parallel = True + else: + recipe.trainer.strategy.sequence_parallel = False + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: + if comm_overlap_callback_idx >= 0: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + + # Misc. for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps * gbs / dp_size + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + if args.log_dir != NEMORUN_HOME: + import sys + + logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") + sys.exit(1) + + exp_name = "_".join( + [ + args.finetuning, + f"llama3_8b", + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={ + "NVTE_FUSED_ATTN": "0", + "NVTE_FLASH_ATTN": "1", + }, + retries=0, + ) + + recipe = llama3_8b_performance_recipe( + args.finetuning, + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. 
+ recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index 8574b4f30f2b..b2baf9974184 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -186,6 +186,12 @@ def parse_cli_args(): help="Enable tensorboard logging. Disabled by default", action="store_true", ) + parser.add_argument( + "-f", + "--finetuning", + help="Finetuning scheme to use. Options- 'sft', 'lora'. Defaults is 'lora'", + default='lora', + ) parser.add_argument( "-d", "--dryrun", From 95f1809a1889b43c3b6721b2c7a6195e6e35115c Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Tue, 7 Jan 2025 17:31:15 +0530 Subject: [PATCH 02/32] llama3 70b Signed-off-by: Malay Nagda --- .../llm/performance/finetuning_llama3_70b.py | 179 ++++++++++++++++++ .../llm/performance/finetuning_llama3_8b.py | 23 ++- scripts/llm/performance/utils.py | 10 +- 3 files changed, 203 insertions(+), 9 deletions(-) create mode 100644 scripts/llm/performance/finetuning_llama3_70b.py diff --git a/scripts/llm/performance/finetuning_llama3_70b.py b/scripts/llm/performance/finetuning_llama3_70b.py new file mode 100644 index 000000000000..3f1deb626e12 --- /dev/null +++ b/scripts/llm/performance/finetuning_llama3_70b.py @@ -0,0 +1,179 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import nemo_run as run +from nemo_run.config import NEMORUN_HOME +from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor, import_ckpt_experiment + +from nemo.collections.llm.recipes.llama3_70b import finetune_recipe, model +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin +from nemo.utils import logging + +NUM_NODES = 1 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 1 +GLOBAL_BATCH_SIZE = 32 +TP_SIZE = 2 +PP_SIZE = 4 +CP_SIZE = 1 +VP_SIZE = 20 +MAX_STEPS = 100 + +HF_MODEL_URI = "meta-llama/Meta-Llama-3-70B" + +def llama3_70b_performance_recipe( + finetuning_scheme: str, + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + llama3 70b pre-train recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. 
It might not give desirable results. + """ + finetuning_scheme = "none" if finetuning_scheme == "sft" else finetuning_scheme + recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI) + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + if tp_size > 1: + recipe.trainer.strategy.sequence_parallel = True + else: + recipe.trainer.strategy.sequence_parallel = False + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: + if comm_overlap_callback_idx >= 0: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + + # Misc. for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + if args.log_dir != NEMORUN_HOME: + import sys + + logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") + sys.exit(1) + + exp_name = "_".join( + [ + args.finetuning.lower(), + f"llama3_70b", + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={ + "NVTE_FUSED_ATTN": "0", + "NVTE_FLASH_ATTN": "1", + # default NEMO_HOME is Path.home() which resolves to '/root' inside NeMo container + # observed behavior is the finetuning task does not recognize the ckpt stored under + # '/root' by 'import_ckpt' task (both tasks under same exp) possibly due to different + # Python processes (need to debug to avoid explicitly setting 'NEMO_HOME'). Paths + # outside '/root' are recognized by both tasks. + "NEMO_HOME": args.log_dir, + }, + retries=0, + ) + + recipe = llama3_70b_performance_recipe( + args.finetuning.lower(), + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. 
+ recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add(*import_ckpt_experiment(NUM_NODES, executor, model(), source=f"hf://{HF_MODEL_URI}")) + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/finetuning_llama3_8b.py b/scripts/llm/performance/finetuning_llama3_8b.py index e6feaec3cb26..61eab620200a 100644 --- a/scripts/llm/performance/finetuning_llama3_8b.py +++ b/scripts/llm/performance/finetuning_llama3_8b.py @@ -16,11 +16,10 @@ import nemo_run as run from nemo_run.config import NEMORUN_HOME -from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor +from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor, import_ckpt_experiment -from nemo.collections.llm.recipes.llama3_8b import finetune_recipe +from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed -from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin from nemo.utils import logging @@ -34,6 +33,7 @@ VP_SIZE = None MAX_STEPS = 100 +HF_MODEL_URI = "meta-llama/Meta-Llama-3-8B" def llama3_8b_performance_recipe( finetuning_scheme: str, @@ -53,13 +53,13 @@ def llama3_8b_performance_recipe( NOTE: Use fp8 precision training with caution. It might not give desirable results. """ + finetuning_scheme = "none" if finetuning_scheme == "sft" else finetuning_scheme recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) # data module configs recipe.data.micro_batch_size = mbs recipe.data.global_batch_size = gbs - recipe.data.num_train_samples = max_steps * gbs # ensure only 1 epoch for whole run - recipe.data.tokenizer = hf_tokenizer("meta-llama/Meta-Llama-3-8B") + recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI) recipe.trainer.max_steps = max_steps recipe.trainer.num_nodes = num_nodes @@ -91,7 +91,7 @@ def llama3_8b_performance_recipe( # Misc. for overall faster experiment runtime recipe.log.ckpt = None recipe.trainer.enable_checkpointing = False - recipe.trainer.val_check_interval = max_steps * gbs / dp_size + recipe.trainer.val_check_interval = max_steps recipe.trainer.log_every_n_steps = 1 return recipe @@ -107,7 +107,7 @@ def llama3_8b_performance_recipe( exp_name = "_".join( [ - args.finetuning, + args.finetuning.lower(), f"llama3_8b", args.compute_dtype, f"{NUM_NODES}nodes", @@ -128,12 +128,18 @@ def llama3_8b_performance_recipe( custom_env_vars={ "NVTE_FUSED_ATTN": "0", "NVTE_FLASH_ATTN": "1", + # default NEMO_HOME is Path.home() which resolves to '/root' inside NeMo container + # observed behavior is the finetuning task does not recognize the ckpt stored under + # '/root' by 'import_ckpt' task (both tasks under same exp) possibly due to different + # Python processes (need to debug to avoid explicitly setting 'NEMO_HOME'). Paths + # outside '/root' are recognized by both tasks. 
+ "NEMO_HOME": args.log_dir, }, retries=0, ) recipe = llama3_8b_performance_recipe( - args.finetuning, + args.finetuning.lower(), args.compute_dtype, NUM_NODES, NUM_GPUS_PER_NODE, @@ -159,6 +165,7 @@ def llama3_8b_performance_recipe( plugins.append(NsysPlugin(start_step=5, end_step=6)) with run.Experiment(exp_name) as exp: + exp.add(*import_ckpt_experiment(NUM_NODES, executor, model(), source=f"hf://{HF_MODEL_URI}")) exp.add( recipe, executor=executor, diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index b2baf9974184..801fc3781ab8 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -22,7 +22,7 @@ from nemo.collections.common.tokenizers.huggingface import AutoTokenizer from nemo.collections.llm.recipes.llama3_8b import MegatronCommOverlapCallback - +from nemo.collections.llm.gpt.model import GPTModel def slurm_executor( account: str, @@ -106,6 +106,14 @@ def hf_tokenizer(model_name: str) -> run.Config[AutoTokenizer]: use_fast=True, ) +def import_ckpt_experiment(num_nodes: int, executor: run.SlurmExecutor, model: run.Config[GPTModel], source: str): + from copy import deepcopy + from nemo.collections.llm import import_ckpt + + import_executor = deepcopy(executor) + import_executor.ntasks_per_node = num_nodes + + return run.Partial(import_ckpt, model = model,source = source, overwrite = False), import_executor, "import_ckpt_exp" def get_comm_overlap_callback_idx(callbacks: List[Callback]): """ From c5c42cc1a7ccac2fa4120beaf382311223397a84 Mon Sep 17 00:00:00 2001 From: malay-nagda Date: Tue, 7 Jan 2025 12:02:34 +0000 Subject: [PATCH 03/32] Apply isort and black reformatting Signed-off-by: malay-nagda --- scripts/llm/performance/finetuning_llama3_70b.py | 3 ++- scripts/llm/performance/finetuning_llama3_8b.py | 3 ++- scripts/llm/performance/utils.py | 8 ++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/llm/performance/finetuning_llama3_70b.py b/scripts/llm/performance/finetuning_llama3_70b.py index 3f1deb626e12..7b7decdadcc8 100644 --- a/scripts/llm/performance/finetuning_llama3_70b.py +++ b/scripts/llm/performance/finetuning_llama3_70b.py @@ -16,7 +16,7 @@ import nemo_run as run from nemo_run.config import NEMORUN_HOME -from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor, import_ckpt_experiment +from utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.llama3_70b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed @@ -35,6 +35,7 @@ HF_MODEL_URI = "meta-llama/Meta-Llama-3-70B" + def llama3_70b_performance_recipe( finetuning_scheme: str, compute_dtype: str, diff --git a/scripts/llm/performance/finetuning_llama3_8b.py b/scripts/llm/performance/finetuning_llama3_8b.py index 61eab620200a..b07faa3d4604 100644 --- a/scripts/llm/performance/finetuning_llama3_8b.py +++ b/scripts/llm/performance/finetuning_llama3_8b.py @@ -16,7 +16,7 @@ import nemo_run as run from nemo_run.config import NEMORUN_HOME -from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor, import_ckpt_experiment +from utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed @@ -35,6 +35,7 @@ 
HF_MODEL_URI = "meta-llama/Meta-Llama-3-8B" + def llama3_8b_performance_recipe( finetuning_scheme: str, compute_dtype: str, diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index 801fc3781ab8..4f06539bc53c 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -21,8 +21,9 @@ from nemo_run.config import NEMORUN_HOME from nemo.collections.common.tokenizers.huggingface import AutoTokenizer -from nemo.collections.llm.recipes.llama3_8b import MegatronCommOverlapCallback from nemo.collections.llm.gpt.model import GPTModel +from nemo.collections.llm.recipes.llama3_8b import MegatronCommOverlapCallback + def slurm_executor( account: str, @@ -106,14 +107,17 @@ def hf_tokenizer(model_name: str) -> run.Config[AutoTokenizer]: use_fast=True, ) + def import_ckpt_experiment(num_nodes: int, executor: run.SlurmExecutor, model: run.Config[GPTModel], source: str): from copy import deepcopy + from nemo.collections.llm import import_ckpt import_executor = deepcopy(executor) import_executor.ntasks_per_node = num_nodes - return run.Partial(import_ckpt, model = model,source = source, overwrite = False), import_executor, "import_ckpt_exp" + return run.Partial(import_ckpt, model=model, source=source, overwrite=False), import_executor, "import_ckpt_exp" + def get_comm_overlap_callback_idx(callbacks: List[Callback]): """ From 4cde0fc7d3820c6a03cc4c797b23e1673ef420cf Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Wed, 8 Jan 2025 22:07:42 +0530 Subject: [PATCH 04/32] peft and slurm functional Signed-off-by: Malay Nagda --- .../llm/performance/finetuning_llama3_70b.py | 20 +++--- .../llm/performance/finetuning_llama3_8b.py | 20 +++--- scripts/llm/performance/utils.py | 71 +++++++++++++------ 3 files changed, 70 insertions(+), 41 deletions(-) diff --git a/scripts/llm/performance/finetuning_llama3_70b.py b/scripts/llm/performance/finetuning_llama3_70b.py index 7b7decdadcc8..7126b0339de5 100644 --- a/scripts/llm/performance/finetuning_llama3_70b.py +++ b/scripts/llm/performance/finetuning_llama3_70b.py @@ -16,12 +16,14 @@ import nemo_run as run from nemo_run.config import NEMORUN_HOME -from utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, parse_cli_args, slurm_executor +from utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, parse_cli_args, slurm_executor, isfile_train_pack_metadata from nemo.collections.llm.recipes.llama3_70b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin from nemo.utils import logging +import os +import sys NUM_NODES = 1 NUM_GPUS_PER_NODE = 8 @@ -61,6 +63,8 @@ def llama3_70b_performance_recipe( recipe.data.micro_batch_size = mbs recipe.data.global_batch_size = gbs recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI) + if not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data): + recipe.data.force_redownload = True recipe.trainer.max_steps = max_steps recipe.trainer.num_nodes = num_nodes @@ -101,10 +105,11 @@ def llama3_70b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() if args.log_dir != NEMORUN_HOME: - import sys - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") sys.exit(1) + if args.nemo_home and args.nemo_home != os.getenv("NEMO_HOME"): + logging.error(f"Run `export NEMO_HOME={args.nemo_home}` in your shell environment and rerun this script.") + 
sys.exit(1) exp_name = "_".join( [ @@ -129,14 +134,9 @@ def llama3_70b_performance_recipe( custom_env_vars={ "NVTE_FUSED_ATTN": "0", "NVTE_FLASH_ATTN": "1", - # default NEMO_HOME is Path.home() which resolves to '/root' inside NeMo container - # observed behavior is the finetuning task does not recognize the ckpt stored under - # '/root' by 'import_ckpt' task (both tasks under same exp) possibly due to different - # Python processes (need to debug to avoid explicitly setting 'NEMO_HOME'). Paths - # outside '/root' are recognized by both tasks. - "NEMO_HOME": args.log_dir, }, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home ) recipe = llama3_70b_performance_recipe( diff --git a/scripts/llm/performance/finetuning_llama3_8b.py b/scripts/llm/performance/finetuning_llama3_8b.py index b07faa3d4604..99d99db17661 100644 --- a/scripts/llm/performance/finetuning_llama3_8b.py +++ b/scripts/llm/performance/finetuning_llama3_8b.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import sys from typing import Optional import nemo_run as run from nemo_run.config import NEMORUN_HOME -from utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, parse_cli_args, slurm_executor +from utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, parse_cli_args, slurm_executor, isfile_train_pack_metadata from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed @@ -61,6 +63,8 @@ def llama3_8b_performance_recipe( recipe.data.micro_batch_size = mbs recipe.data.global_batch_size = gbs recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI) + if not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data): + recipe.data.force_redownload = True recipe.trainer.max_steps = max_steps recipe.trainer.num_nodes = num_nodes @@ -101,10 +105,11 @@ def llama3_8b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() if args.log_dir != NEMORUN_HOME: - import sys - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") sys.exit(1) + if args.nemo_home and args.nemo_home != os.getenv("NEMO_HOME"): + logging.error(f"Run `export NEMO_HOME={args.nemo_home}` in your shell environment and rerun this script.") + sys.exit(1) exp_name = "_".join( [ @@ -129,14 +134,9 @@ def llama3_8b_performance_recipe( custom_env_vars={ "NVTE_FUSED_ATTN": "0", "NVTE_FLASH_ATTN": "1", - # default NEMO_HOME is Path.home() which resolves to '/root' inside NeMo container - # observed behavior is the finetuning task does not recognize the ckpt stored under - # '/root' by 'import_ckpt' task (both tasks under same exp) possibly due to different - # Python processes (need to debug to avoid explicitly setting 'NEMO_HOME'). Paths - # outside '/root' are recognized by both tasks. 
- "NEMO_HOME": args.log_dir, }, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home ) recipe = llama3_8b_performance_recipe( diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index 4f06539bc53c..ec2314cae3e7 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -14,7 +14,7 @@ import argparse import os -from typing import Dict, List, Optional +from typing import Dict, List import nemo_run as run from lightning.pytorch.callbacks.callback import Callback @@ -23,7 +23,9 @@ from nemo.collections.common.tokenizers.huggingface import AutoTokenizer from nemo.collections.llm.gpt.model import GPTModel from nemo.collections.llm.recipes.llama3_8b import MegatronCommOverlapCallback - +from nemo.lightning.base import DEFAULT_NEMO_CACHE_HOME +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.lightning.base import DEFAULT_NEMO_CACHE_HOME def slurm_executor( account: str, @@ -31,12 +33,13 @@ def slurm_executor( log_dir: str, nodes: int, num_gpus_per_node: int, - time_limit: str = "01:00:00", + time_limit: str = "00:30:00", container_image: str = "nvcr.io/nvidia/nemo:dev", - custom_mounts: Optional[List[str]] = None, - custom_env_vars: Optional[Dict[str, str]] = None, - custom_srun_args: Optional[List[str]] = None, - retries: int = 0, + custom_mounts: List[str] = [], + custom_env_vars: Dict[str, str] = {}, + custom_srun_args: List[str] = [], + hf_token: str = None, + nemo_home: str = DEFAULT_NEMO_CACHE_HOME, ) -> run.SlurmExecutor: """ Slurm cluster definition with appropriate cluster params and NeMo container params needed for pre-training @@ -48,9 +51,9 @@ def slurm_executor( "function.", ) - mounts = [] - if custom_mounts: - mounts.extend(custom_mounts) + if nemo_home != DEFAULT_NEMO_CACHE_HOME: + custom_mounts.extend([f"{nemo_home}:{nemo_home}"]) + custom_env_vars.update({"NEMO_HOME": nemo_home}) env_vars = { "TRANSFORMERS_OFFLINE": "1", @@ -64,12 +67,12 @@ def slurm_executor( "NEMO_LOG_MEMORY_USAGE": "1", "NEMORUN_HOME": log_dir, } - if custom_env_vars: - env_vars |= custom_env_vars + if hf_token is not None: + custom_env_vars.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"}) + env_vars |= custom_env_vars srun_args = ["--mpi=pmix"] - if custom_srun_args: - srun_args.extend(custom_srun_args) + srun_args.extend(custom_srun_args) executor = run.SlurmExecutor( account=account, @@ -79,18 +82,16 @@ def slurm_executor( ), nodes=nodes, ntasks_per_node=num_gpus_per_node, + container_image=container_image, + container_mounts=custom_mounts, + env_vars=env_vars, + srun_args=srun_args, + time=time_limit, mem="0", exclusive=True, packager=run.GitArchivePackager(), ) - executor.container_image = container_image - executor.container_mounts = mounts - executor.env_vars = env_vars - executor.srun_args = srun_args - executor.retries = retries - executor.time = time_limit - return executor @@ -118,6 +119,17 @@ def import_ckpt_experiment(num_nodes: int, executor: run.SlurmExecutor, model: r return run.Partial(import_ckpt, model=model, source=source, overwrite=False), import_executor, "import_ckpt_exp" +def isfile_train_pack_metadata(hf_model_uri: str, data_config: run.Config[SquadDataModule]): + train_pack_metadata_filepath = "" + if data_config.__fn_or_cls__ == SquadDataModule: + datasets_dir = os.getenv( + "NEMO_DATASETS_CACHE", os.path.join(os.getenv("NEMO_HOME", DEFAULT_NEMO_CACHE_HOME), "datasets") + ) + model_dir = hf_model_uri.replace("/", "--") + metadata_filename = 
f"train_{data_config.seq_length}_metadata.jsonl" + + train_pack_metadata_filepath = os.path.join(datasets_dir, "squad", "packed", model_dir, metadata_filename) + return os.path.exists(train_pack_metadata_filepath) and os.path.isfile(train_pack_metadata_filepath) def get_comm_overlap_callback_idx(callbacks: List[Callback]): """ @@ -204,6 +216,23 @@ def parse_cli_args(): help="Finetuning scheme to use. Options- 'sft', 'lora'. Defaults is 'lora'", default='lora', ) + parser.add_argument( + "-hf", + "--hf_token", + type=str, + help="HuggingFace token. Defaults to None. Required for accessing tokenizers and checkpoints.", + default=None, + ) + nemo_home_msg = ["Directory where NeMo searches for models and checkpoints.", + "This saves a lot of time (especially for bigger models) if checkpoints already exist here.", + f"Missing files will be downloaded from HuggingFace. Defaults to {DEFAULT_NEMO_CACHE_HOME}"] + parser.add_argument( + "-nh", + "--nemo_home", + type=str, + help=" ".join(nemo_home_msg), + default=DEFAULT_NEMO_CACHE_HOME, + ) parser.add_argument( "-d", "--dryrun", From 372d376d39a976bafde2cfd266f2934190f1b752 Mon Sep 17 00:00:00 2001 From: malay-nagda Date: Wed, 8 Jan 2025 16:38:42 +0000 Subject: [PATCH 05/32] Apply isort and black reformatting Signed-off-by: malay-nagda --- scripts/llm/performance/finetuning_llama3_70b.py | 15 +++++++++++---- scripts/llm/performance/finetuning_llama3_8b.py | 11 +++++++++-- scripts/llm/performance/utils.py | 16 ++++++++++------ 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/scripts/llm/performance/finetuning_llama3_70b.py b/scripts/llm/performance/finetuning_llama3_70b.py index 7126b0339de5..f73a5bd2906c 100644 --- a/scripts/llm/performance/finetuning_llama3_70b.py +++ b/scripts/llm/performance/finetuning_llama3_70b.py @@ -12,18 +12,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os +import sys from typing import Optional import nemo_run as run from nemo_run.config import NEMORUN_HOME -from utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, parse_cli_args, slurm_executor, isfile_train_pack_metadata +from utils import ( + get_comm_overlap_callback_idx, + hf_tokenizer, + import_ckpt_experiment, + isfile_train_pack_metadata, + parse_cli_args, + slurm_executor, +) from nemo.collections.llm.recipes.llama3_70b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin from nemo.utils import logging -import os -import sys NUM_NODES = 1 NUM_GPUS_PER_NODE = 8 @@ -136,7 +143,7 @@ def llama3_70b_performance_recipe( "NVTE_FLASH_ATTN": "1", }, hf_token=args.hf_token, - nemo_home=args.nemo_home + nemo_home=args.nemo_home, ) recipe = llama3_70b_performance_recipe( diff --git a/scripts/llm/performance/finetuning_llama3_8b.py b/scripts/llm/performance/finetuning_llama3_8b.py index 99d99db17661..08e26fd98195 100644 --- a/scripts/llm/performance/finetuning_llama3_8b.py +++ b/scripts/llm/performance/finetuning_llama3_8b.py @@ -18,7 +18,14 @@ import nemo_run as run from nemo_run.config import NEMORUN_HOME -from utils import get_comm_overlap_callback_idx, hf_tokenizer, import_ckpt_experiment, parse_cli_args, slurm_executor, isfile_train_pack_metadata +from utils import ( + get_comm_overlap_callback_idx, + hf_tokenizer, + import_ckpt_experiment, + isfile_train_pack_metadata, + parse_cli_args, + slurm_executor, +) from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed @@ -136,7 +143,7 @@ def llama3_8b_performance_recipe( "NVTE_FLASH_ATTN": "1", }, hf_token=args.hf_token, - nemo_home=args.nemo_home + nemo_home=args.nemo_home, ) recipe = llama3_8b_performance_recipe( diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index ec2314cae3e7..cb72b6e25bfc 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -21,11 +21,11 @@ from nemo_run.config import NEMORUN_HOME from nemo.collections.common.tokenizers.huggingface import AutoTokenizer +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.gpt.model import GPTModel from nemo.collections.llm.recipes.llama3_8b import MegatronCommOverlapCallback from nemo.lightning.base import DEFAULT_NEMO_CACHE_HOME -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.lightning.base import DEFAULT_NEMO_CACHE_HOME + def slurm_executor( account: str, @@ -119,18 +119,20 @@ def import_ckpt_experiment(num_nodes: int, executor: run.SlurmExecutor, model: r return run.Partial(import_ckpt, model=model, source=source, overwrite=False), import_executor, "import_ckpt_exp" + def isfile_train_pack_metadata(hf_model_uri: str, data_config: run.Config[SquadDataModule]): train_pack_metadata_filepath = "" if data_config.__fn_or_cls__ == SquadDataModule: datasets_dir = os.getenv( "NEMO_DATASETS_CACHE", os.path.join(os.getenv("NEMO_HOME", DEFAULT_NEMO_CACHE_HOME), "datasets") - ) + ) model_dir = hf_model_uri.replace("/", "--") metadata_filename = f"train_{data_config.seq_length}_metadata.jsonl" train_pack_metadata_filepath = os.path.join(datasets_dir, "squad", "packed", model_dir, metadata_filename) return os.path.exists(train_pack_metadata_filepath) and os.path.isfile(train_pack_metadata_filepath) + 
def get_comm_overlap_callback_idx(callbacks: List[Callback]): """ nemo.lightning.Trainer has a list of callbacks defined. This method identifies index of MegatronCommOverlapCallback @@ -223,9 +225,11 @@ def parse_cli_args(): help="HuggingFace token. Defaults to None. Required for accessing tokenizers and checkpoints.", default=None, ) - nemo_home_msg = ["Directory where NeMo searches for models and checkpoints.", - "This saves a lot of time (especially for bigger models) if checkpoints already exist here.", - f"Missing files will be downloaded from HuggingFace. Defaults to {DEFAULT_NEMO_CACHE_HOME}"] + nemo_home_msg = [ + "Directory where NeMo searches for models and checkpoints.", + "This saves a lot of time (especially for bigger models) if checkpoints already exist here.", + f"Missing files will be downloaded from HuggingFace. Defaults to {DEFAULT_NEMO_CACHE_HOME}", + ] parser.add_argument( "-nh", "--nemo_home", From 92c381eec6ea9108ff0169402ce03855485252f9 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Thu, 9 Jan 2025 12:19:48 +0530 Subject: [PATCH 06/32] formatting and cleanup Signed-off-by: Malay Nagda --- .../llm/performance/finetuning_llama3_70b.py | 10 ---- .../llm/performance/finetuning_llama3_8b.py | 10 ---- scripts/llm/performance/gpt3_175b.py | 7 --- scripts/llm/performance/llama3_405b.py | 7 --- scripts/llm/performance/llama3_70b.py | 7 --- scripts/llm/performance/llama3_8b.py | 7 --- scripts/llm/performance/mixtral_8x22b.py | 7 --- scripts/llm/performance/mixtral_8x7b.py | 7 --- scripts/llm/performance/utils.py | 52 +++++++++++-------- 9 files changed, 31 insertions(+), 83 deletions(-) diff --git a/scripts/llm/performance/finetuning_llama3_70b.py b/scripts/llm/performance/finetuning_llama3_70b.py index f73a5bd2906c..756818fd322a 100644 --- a/scripts/llm/performance/finetuning_llama3_70b.py +++ b/scripts/llm/performance/finetuning_llama3_70b.py @@ -12,12 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import sys from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import ( get_comm_overlap_callback_idx, hf_tokenizer, @@ -30,7 +27,6 @@ from nemo.collections.llm.recipes.llama3_70b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 1 NUM_GPUS_PER_NODE = 8 @@ -111,12 +107,6 @@ def llama3_70b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) - if args.nemo_home and args.nemo_home != os.getenv("NEMO_HOME"): - logging.error(f"Run `export NEMO_HOME={args.nemo_home}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ diff --git a/scripts/llm/performance/finetuning_llama3_8b.py b/scripts/llm/performance/finetuning_llama3_8b.py index 08e26fd98195..a05bc7563693 100644 --- a/scripts/llm/performance/finetuning_llama3_8b.py +++ b/scripts/llm/performance/finetuning_llama3_8b.py @@ -12,12 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -import sys from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import ( get_comm_overlap_callback_idx, hf_tokenizer, @@ -30,7 +27,6 @@ from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 1 NUM_GPUS_PER_NODE = 8 @@ -111,12 +107,6 @@ def llama3_8b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) - if args.nemo_home and args.nemo_home != os.getenv("NEMO_HOME"): - logging.error(f"Run `export NEMO_HOME={args.nemo_home}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ diff --git a/scripts/llm/performance/gpt3_175b.py b/scripts/llm/performance/gpt3_175b.py index 01a3f7381628..f75fc710b408 100644 --- a/scripts/llm/performance/gpt3_175b.py +++ b/scripts/llm/performance/gpt3_175b.py @@ -15,14 +15,12 @@ from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.gpt3_175b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 64 NUM_GPUS_PER_NODE = 8 @@ -111,11 +109,6 @@ def gpt3_175b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ diff --git a/scripts/llm/performance/llama3_405b.py b/scripts/llm/performance/llama3_405b.py index dd6194c7f8b3..205c2852bfe5 100644 --- a/scripts/llm/performance/llama3_405b.py +++ b/scripts/llm/performance/llama3_405b.py @@ -15,14 +15,12 @@ from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.llama31_405b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 72 NUM_GPUS_PER_NODE = 8 @@ -111,11 +109,6 @@ def llama3_405b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ diff --git a/scripts/llm/performance/llama3_70b.py b/scripts/llm/performance/llama3_70b.py index 97babadbe803..ebdeda9bf574 100644 --- a/scripts/llm/performance/llama3_70b.py +++ b/scripts/llm/performance/llama3_70b.py @@ -15,14 +15,12 @@ from typing import Optional import nemo_run as run -from nemo_run.config import 
NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.llama3_70b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 8 NUM_GPUS_PER_NODE = 8 @@ -111,11 +109,6 @@ def llama3_70b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ diff --git a/scripts/llm/performance/llama3_8b.py b/scripts/llm/performance/llama3_8b.py index 81382cc33a16..50a9216a833d 100644 --- a/scripts/llm/performance/llama3_8b.py +++ b/scripts/llm/performance/llama3_8b.py @@ -15,14 +15,12 @@ from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.llama3_8b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 1 NUM_GPUS_PER_NODE = 8 @@ -108,11 +106,6 @@ def llama3_8b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ diff --git a/scripts/llm/performance/mixtral_8x22b.py b/scripts/llm/performance/mixtral_8x22b.py index b474561296e4..aad2a9946019 100644 --- a/scripts/llm/performance/mixtral_8x22b.py +++ b/scripts/llm/performance/mixtral_8x22b.py @@ -15,14 +15,12 @@ from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.mixtral_8x7b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 128 NUM_GPUS_PER_NODE = 8 @@ -111,11 +109,6 @@ def mixtral_8x22b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ diff --git a/scripts/llm/performance/mixtral_8x7b.py b/scripts/llm/performance/mixtral_8x7b.py index 4d5321269227..f5ad8c6413c4 100644 --- a/scripts/llm/performance/mixtral_8x7b.py +++ b/scripts/llm/performance/mixtral_8x7b.py @@ -15,14 +15,12 @@ from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.mixtral_8x7b import 
pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 8 NUM_GPUS_PER_NODE = 8 @@ -111,11 +109,6 @@ def mixtral_8x7b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index 4247667aa24b..d8acfaea8b0e 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -14,6 +14,7 @@ import argparse import os +import sys from typing import Dict, List import nemo_run as run @@ -27,6 +28,7 @@ from nemo.lightning.base import DEFAULT_NEMO_CACHE_HOME from nemo.utils import logging +DEFAULT_NEMO_HOME = os.getenv('NEMO_HOME', DEFAULT_NEMO_CACHE_HOME) def slurm_executor( account: str, @@ -40,21 +42,20 @@ def slurm_executor( custom_env_vars: Dict[str, str] = {}, custom_srun_args: List[str] = [], hf_token: str = None, - nemo_home: str = DEFAULT_NEMO_CACHE_HOME, + nemo_home: str = DEFAULT_NEMO_HOME, ) -> run.SlurmExecutor: """ Slurm cluster definition with appropriate cluster params and NeMo container params needed for pre-training and fine-tuning experiments """ - if not (log_dir and account and partition and nodes and num_gpus_per_node): - raise RuntimeError( - "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this ", - "function.", - ) - - if nemo_home != DEFAULT_NEMO_CACHE_HOME: - custom_mounts.extend([f"{nemo_home}:{nemo_home}"]) - custom_env_vars.update({"NEMO_HOME": nemo_home}) + err_msgs = [] + if log_dir != NEMORUN_HOME: + err_msgs.append(f"Run `export NEMORUN_HOME={log_dir}` in your shell environment and rerun this script.") + if nemo_home != DEFAULT_NEMO_HOME: + err_msgs.append(f"Run `export NEMO_HOME={nemo_home}` in your shell environment and rerun this script.") + if len(err_msgs) > 0: + logging.error(err_msgs) + sys.exit(1) env_vars = { "TRANSFORMERS_OFFLINE": "1", @@ -67,11 +68,17 @@ def slurm_executor( "NEMO_LOG_MEMORY_USAGE": "1", "NEMORUN_HOME": log_dir, } + mounts = [] + srun_args = ["--mpi=pmix"] + + if nemo_home != DEFAULT_NEMO_HOME: + env_vars.update({"NEMO_HOME": nemo_home}) + mounts.extend([f"{nemo_home}:{nemo_home}"]) if hf_token is not None: - custom_env_vars.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"}) + env_vars.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"}) + env_vars |= custom_env_vars - - srun_args = ["--mpi=pmix"] + mounts.extend(custom_mounts) srun_args.extend(custom_srun_args) executor = run.SlurmExecutor( @@ -83,7 +90,7 @@ def slurm_executor( nodes=nodes, ntasks_per_node=num_gpus_per_node, container_image=container_image, - container_mounts=custom_mounts, + container_mounts=mounts, env_vars=env_vars, srun_args=srun_args, time=time_limit, @@ -134,13 +141,12 @@ def import_ckpt_experiment(num_nodes: int, executor: run.SlurmExecutor, model: r def isfile_train_pack_metadata(hf_model_uri: str, data_config: run.Config[SquadDataModule]): train_pack_metadata_filepath = "" if data_config.__fn_or_cls__ == SquadDataModule: - datasets_dir = os.getenv( - "NEMO_DATASETS_CACHE", os.path.join(os.getenv("NEMO_HOME", DEFAULT_NEMO_CACHE_HOME), 
"datasets") - ) + datasets_dir = os.getenv("NEMO_DATASETS_CACHE", os.path.join(DEFAULT_NEMO_HOME, "datasets")) model_dir = hf_model_uri.replace("/", "--") metadata_filename = f"train_{data_config.seq_length}_metadata.jsonl" train_pack_metadata_filepath = os.path.join(datasets_dir, "squad", "packed", model_dir, metadata_filename) + return os.path.exists(train_pack_metadata_filepath) and os.path.isfile(train_pack_metadata_filepath) @@ -194,12 +200,15 @@ def parse_cli_args(): required=False, default="00:30:00", ) + container_img_msg = [ + "NeMo container to use for experiment. Defaults to latest dev container- 'nvcr.io/nvidia/nemo:dev'", + "Make sure your NGC credentials are accessible in your environment.", + ] parser.add_argument( "-i", "--container_image", type=str, - help="NeMo container to use for experiment. Defaults to latest dev container- 'nvcr.io/nvidia/nemo:dev'\ - Make sure your NGC credentials are accessible in your environment.", + help=" ".join(container_img_msg), required=False, default="nvcr.io/nvidia/nemo:dev", ) @@ -239,14 +248,15 @@ def parse_cli_args(): nemo_home_msg = [ "Directory where NeMo searches for models and checkpoints.", "This saves a lot of time (especially for bigger models) if checkpoints already exist here.", - f"Missing files will be downloaded from HuggingFace. Defaults to {DEFAULT_NEMO_CACHE_HOME}", + "Missing files will be downloaded from HuggingFace., " + f"Defaults to {DEFAULT_NEMO_HOME}", ] parser.add_argument( "-nh", "--nemo_home", type=str, help=" ".join(nemo_home_msg), - default=DEFAULT_NEMO_CACHE_HOME, + default=DEFAULT_NEMO_HOME, ) parser.add_argument( "-d", From 1bc19a14a34a314d612faa133ec5ec0fbe3b9a4a Mon Sep 17 00:00:00 2001 From: malay-nagda Date: Thu, 9 Jan 2025 06:50:52 +0000 Subject: [PATCH 07/32] Apply isort and black reformatting Signed-off-by: malay-nagda --- scripts/llm/performance/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index d8acfaea8b0e..6e7d15664f52 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -30,6 +30,7 @@ DEFAULT_NEMO_HOME = os.getenv('NEMO_HOME', DEFAULT_NEMO_CACHE_HOME) + def slurm_executor( account: str, partition: str, @@ -76,7 +77,7 @@ def slurm_executor( mounts.extend([f"{nemo_home}:{nemo_home}"]) if hf_token is not None: env_vars.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"}) - + env_vars |= custom_env_vars mounts.extend(custom_mounts) srun_args.extend(custom_srun_args) @@ -146,7 +147,7 @@ def isfile_train_pack_metadata(hf_model_uri: str, data_config: run.Config[SquadD metadata_filename = f"train_{data_config.seq_length}_metadata.jsonl" train_pack_metadata_filepath = os.path.join(datasets_dir, "squad", "packed", model_dir, metadata_filename) - + return os.path.exists(train_pack_metadata_filepath) and os.path.isfile(train_pack_metadata_filepath) @@ -248,8 +249,7 @@ def parse_cli_args(): nemo_home_msg = [ "Directory where NeMo searches for models and checkpoints.", "This saves a lot of time (especially for bigger models) if checkpoints already exist here.", - "Missing files will be downloaded from HuggingFace., " - f"Defaults to {DEFAULT_NEMO_HOME}", + "Missing files will be downloaded from HuggingFace., " f"Defaults to {DEFAULT_NEMO_HOME}", ] parser.add_argument( "-nh", From cf3bc02ac13332a71358e41e19e177eb6c034a09 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Thu, 9 Jan 2025 14:03:48 +0530 Subject: [PATCH 08/32] 405b lora + more cleanup Signed-off-by: 
Malay Nagda --- .../llm/performance/finetuning_llama3_405b.py | 179 ++++++++++++++++++ .../llm/performance/finetuning_llama3_70b.py | 6 +- .../llm/performance/finetuning_llama3_8b.py | 6 +- scripts/llm/performance/gpt3_175b.py | 2 +- scripts/llm/performance/llama3_405b.py | 2 +- scripts/llm/performance/llama3_70b.py | 2 +- scripts/llm/performance/llama3_8b.py | 2 +- scripts/llm/performance/mixtral_8x22b.py | 2 +- scripts/llm/performance/mixtral_8x7b.py | 2 +- scripts/llm/performance/utils.py | 25 ++- 10 files changed, 211 insertions(+), 17 deletions(-) create mode 100644 scripts/llm/performance/finetuning_llama3_405b.py diff --git a/scripts/llm/performance/finetuning_llama3_405b.py b/scripts/llm/performance/finetuning_llama3_405b.py new file mode 100644 index 000000000000..ca5394ccf621 --- /dev/null +++ b/scripts/llm/performance/finetuning_llama3_405b.py @@ -0,0 +1,179 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import nemo_run as run +from utils import ( + get_comm_overlap_callback_idx, + hf_tokenizer, + import_ckpt_experiment, + isfile_train_pack_metadata, + parse_cli_args, + slurm_executor, +) + +from nemo.collections.llm.recipes.llama31_405b import finetune_recipe, model +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin +from nemo.collections.llm.gpt.data.squad import SquadDataModule + +NUM_NODES = 3 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 1 +GLOBAL_BATCH_SIZE = 24 +TP_SIZE = 4 +PP_SIZE = 6 +CP_SIZE = 1 +VP_SIZE = 7 +MAX_STEPS = 100 + +HF_MODEL_URI = "meta-llama/Llama-3.1-405B" + + +def llama31_405b_performance_recipe( + finetuning_scheme: str, + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + llama3.1 405b pre-train recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results. 
+ """ + finetuning_scheme = "none" if finetuning_scheme == "sft" else finetuning_scheme + recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI) + if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data): + # flag is valid only for SquadDataModule + recipe.data.force_redownload = True + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + if tp_size > 1: + recipe.trainer.strategy.sequence_parallel = True + else: + recipe.trainer.strategy.sequence_parallel = False + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: + if comm_overlap_callback_idx >= 0: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + + # Misc. for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + args.finetuning.lower(), + f"llama31_405b", + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={ + "NVTE_FUSED_ATTN": "0", + "NVTE_FLASH_ATTN": "1", + }, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = llama31_405b_performance_recipe( + args.finetuning.lower(), + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. 
+ recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add(*import_ckpt_experiment(NUM_NODES, executor, model(), source=f"hf://{HF_MODEL_URI}")) + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/finetuning_llama3_70b.py b/scripts/llm/performance/finetuning_llama3_70b.py index 756818fd322a..77902385b054 100644 --- a/scripts/llm/performance/finetuning_llama3_70b.py +++ b/scripts/llm/performance/finetuning_llama3_70b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,6 +27,7 @@ from nemo.collections.llm.recipes.llama3_70b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin +from nemo.collections.llm.gpt.data.squad import SquadDataModule NUM_NODES = 1 NUM_GPUS_PER_NODE = 8 @@ -66,7 +67,8 @@ def llama3_70b_performance_recipe( recipe.data.micro_batch_size = mbs recipe.data.global_batch_size = gbs recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI) - if not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data): + if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data): + # flag is valid only for SquadDataModule recipe.data.force_redownload = True recipe.trainer.max_steps = max_steps diff --git a/scripts/llm/performance/finetuning_llama3_8b.py b/scripts/llm/performance/finetuning_llama3_8b.py index a05bc7563693..88afedf7b827 100644 --- a/scripts/llm/performance/finetuning_llama3_8b.py +++ b/scripts/llm/performance/finetuning_llama3_8b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -27,6 +27,7 @@ from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin +from nemo.collections.llm.gpt.data.squad import SquadDataModule NUM_NODES = 1 NUM_GPUS_PER_NODE = 8 @@ -66,7 +67,8 @@ def llama3_8b_performance_recipe( recipe.data.micro_batch_size = mbs recipe.data.global_batch_size = gbs recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI) - if not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data): + if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data): + # flag is valid only for SquadDataModule recipe.data.force_redownload = True recipe.trainer.max_steps = max_steps diff --git a/scripts/llm/performance/gpt3_175b.py b/scripts/llm/performance/gpt3_175b.py index f75fc710b408..cb3a989bbc67 100644 --- a/scripts/llm/performance/gpt3_175b.py +++ b/scripts/llm/performance/gpt3_175b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/scripts/llm/performance/llama3_405b.py b/scripts/llm/performance/llama3_405b.py index 205c2852bfe5..e5161344fc70 100644 --- a/scripts/llm/performance/llama3_405b.py +++ b/scripts/llm/performance/llama3_405b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/scripts/llm/performance/llama3_70b.py b/scripts/llm/performance/llama3_70b.py index ebdeda9bf574..41ae3d6717f9 100644 --- a/scripts/llm/performance/llama3_70b.py +++ b/scripts/llm/performance/llama3_70b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/scripts/llm/performance/llama3_8b.py b/scripts/llm/performance/llama3_8b.py index 50a9216a833d..6adf98765f2a 100644 --- a/scripts/llm/performance/llama3_8b.py +++ b/scripts/llm/performance/llama3_8b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/scripts/llm/performance/mixtral_8x22b.py b/scripts/llm/performance/mixtral_8x22b.py index aad2a9946019..485c6c61349e 100644 --- a/scripts/llm/performance/mixtral_8x22b.py +++ b/scripts/llm/performance/mixtral_8x22b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
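Reviewer note: the SquadDataModule guard added above (and mirrored in the 8b script) is easier to read outside the diff. A minimal sketch, assuming the helpers from scripts/llm/performance/utils.py in this series (hf_tokenizer, isfile_train_pack_metadata) and using the llama3 8b URI purely as an example:

    from nemo.collections.llm.gpt.data.squad import SquadDataModule
    from nemo.collections.llm.recipes.llama3_8b import finetune_recipe
    from utils import hf_tokenizer, isfile_train_pack_metadata

    HF_MODEL_URI = "meta-llama/Meta-Llama-3-8B"  # example; each script defines its own URI

    recipe = finetune_recipe(peft_scheme="lora", performance_mode=True)
    recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI)

    # recipe.data is a nemo_run Config, not an instantiated datamodule, so the wrapped
    # class is checked via __fn_or_cls__ rather than isinstance().
    if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data):
        # force_redownload is only meaningful for SquadDataModule; re-downloading
        # regenerates the packed-sequence metadata file that is missing locally.
        recipe.data.force_redownload = True

The guard leaves non-Squad data modules untouched while still avoiding the crash that a missing packed-data metadata file would otherwise cause at run time.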
diff --git a/scripts/llm/performance/mixtral_8x7b.py b/scripts/llm/performance/mixtral_8x7b.py index f5ad8c6413c4..9249eedfdc8e 100644 --- a/scripts/llm/performance/mixtral_8x7b.py +++ b/scripts/llm/performance/mixtral_8x7b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index 6e7d15664f52..d3243a7102c7 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -129,6 +129,14 @@ def hf_tokenizer(model_name: str) -> run.Config[AutoTokenizer]: def import_ckpt_experiment(num_nodes: int, executor: run.SlurmExecutor, model: run.Config[GPTModel], source: str): + """ + Downloads/Acceses checkpoint to be used for fine-tuning. `import_ckpt` first tries find the nemo checkpoint in + /models/. For eg: for llama3 8b, the path will look like- /models/meta-llama/Meta-Llama-3-8B + If missing, tries to downloads at the same location from HuggingFace and converts it nemo format. + + Args: + source (str): HuggingFace URL. For eg- hf://meta-llama/Meta-Llama-3-70B + """ from copy import deepcopy from nemo.collections.llm import import_ckpt @@ -140,13 +148,16 @@ def import_ckpt_experiment(num_nodes: int, executor: run.SlurmExecutor, model: r def isfile_train_pack_metadata(hf_model_uri: str, data_config: run.Config[SquadDataModule]): - train_pack_metadata_filepath = "" - if data_config.__fn_or_cls__ == SquadDataModule: - datasets_dir = os.getenv("NEMO_DATASETS_CACHE", os.path.join(DEFAULT_NEMO_HOME, "datasets")) - model_dir = hf_model_uri.replace("/", "--") - metadata_filename = f"train_{data_config.seq_length}_metadata.jsonl" + """ + This method is used for fine-tuning. It checks if packed train data for a partiular + sequence length exists locally. This is needed to set data flag (force_redownload=True) + which avoids experiment crash in case files are missing. 
+ """ + datasets_dir = os.getenv("NEMO_DATASETS_CACHE", os.path.join(DEFAULT_NEMO_HOME, "datasets")) + model_dir = hf_model_uri.replace("/", "--") + metadata_filename = f"train_{data_config.seq_length}_metadata.jsonl" - train_pack_metadata_filepath = os.path.join(datasets_dir, "squad", "packed", model_dir, metadata_filename) + train_pack_metadata_filepath = os.path.join(datasets_dir, "squad", "packed", model_dir, metadata_filename) return os.path.exists(train_pack_metadata_filepath) and os.path.isfile(train_pack_metadata_filepath) From fcbe6676bc746653a5130210e8469cb0bdabbac3 Mon Sep 17 00:00:00 2001 From: malay-nagda Date: Thu, 9 Jan 2025 08:34:45 +0000 Subject: [PATCH 09/32] Apply isort and black reformatting Signed-off-by: malay-nagda --- scripts/llm/performance/finetuning_llama3_405b.py | 2 +- scripts/llm/performance/finetuning_llama3_70b.py | 2 +- scripts/llm/performance/finetuning_llama3_8b.py | 2 +- scripts/llm/performance/utils.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/llm/performance/finetuning_llama3_405b.py b/scripts/llm/performance/finetuning_llama3_405b.py index ca5394ccf621..9c57f523b70d 100644 --- a/scripts/llm/performance/finetuning_llama3_405b.py +++ b/scripts/llm/performance/finetuning_llama3_405b.py @@ -24,10 +24,10 @@ slurm_executor, ) +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes.llama31_405b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.collections.llm.gpt.data.squad import SquadDataModule NUM_NODES = 3 NUM_GPUS_PER_NODE = 8 diff --git a/scripts/llm/performance/finetuning_llama3_70b.py b/scripts/llm/performance/finetuning_llama3_70b.py index 77902385b054..f11b0b2aa868 100644 --- a/scripts/llm/performance/finetuning_llama3_70b.py +++ b/scripts/llm/performance/finetuning_llama3_70b.py @@ -24,10 +24,10 @@ slurm_executor, ) +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes.llama3_70b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.collections.llm.gpt.data.squad import SquadDataModule NUM_NODES = 1 NUM_GPUS_PER_NODE = 8 diff --git a/scripts/llm/performance/finetuning_llama3_8b.py b/scripts/llm/performance/finetuning_llama3_8b.py index 88afedf7b827..1e8cafed54b3 100644 --- a/scripts/llm/performance/finetuning_llama3_8b.py +++ b/scripts/llm/performance/finetuning_llama3_8b.py @@ -24,10 +24,10 @@ slurm_executor, ) +from nemo.collections.llm.gpt.data.squad import SquadDataModule from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.collections.llm.gpt.data.squad import SquadDataModule NUM_NODES = 1 NUM_GPUS_PER_NODE = 8 diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index d3243a7102c7..c02f1d9004ca 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -149,7 +149,7 @@ def import_ckpt_experiment(num_nodes: int, executor: run.SlurmExecutor, model: r def isfile_train_pack_metadata(hf_model_uri: str, data_config: run.Config[SquadDataModule]): """ - This method is used for fine-tuning. 
It checks if packed train data for a partiular + This method is used for fine-tuning. It checks if packed train data for a partiular sequence length exists locally. This is needed to set data flag (force_redownload=True) which avoids experiment crash in case files are missing. """ From 46c89bd5bbceda1840ee4c1b957c03d2ee9463e4 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Fri, 10 Jan 2025 15:11:57 +0530 Subject: [PATCH 10/32] no tp comm, import ckpt, data filename Signed-off-by: Malay Nagda --- scripts/llm/performance/README.md | 13 +++++++---- .../llm/performance/finetuning_llama3_405b.py | 5 ++-- .../llm/performance/finetuning_llama3_70b.py | 5 ++-- .../llm/performance/finetuning_llama3_8b.py | 5 ++-- scripts/llm/performance/utils.py | 23 ++++++++++--------- 5 files changed, 29 insertions(+), 22 deletions(-) diff --git a/scripts/llm/performance/README.md b/scripts/llm/performance/README.md index 62bf58329633..99bf92ed179b 100644 --- a/scripts/llm/performance/README.md +++ b/scripts/llm/performance/README.md @@ -13,15 +13,18 @@ The following line shows an example of how you can launch a pre-training experim - Slurm account and partition are mandatory arguments for launching the experiment. - You can use the following optional arguments as needed- - - -l/--log_dir: Location to store your experiment artifacts and logs. - - Make sure the environemnt variable `NEMORUN_HOME=` is accessible and set correctly in your virtual environment. + - -l/--log_dir: Location to store your experiment artifacts and logs. + - Make sure the environemnt variable `NEMORUN_HOME=` is accessible and set correctly in your virtual environment. - You can run `export NEMORUN_HOME=` in your terminal. You can add it your bashrc file (or equivalent for your OS/Linux distro) for setting it permanently. - -t/--time_limit: Maximum time limit for your experiment. Your slurm job will be cancelled after this. Default is 30 minutes. - -i/--container_image: The NeMo container you want to use. Defaults to latest dev container- 'nvcr.io/nvidia/nemo:dev'. - -c/--compute_dtype: Specifies whether you want to use bf16 or fp8 precision for training. Defaults to 'bf16'. You can choose to use 'fp8'. - - -ep/--enable_profiling: Enable nsys profiling. It is disabled by default. When enabled, profiling will be enabled for 1 step from step 5 to step 6. You can change the step in the respective recipe script. - - -tb/--tensorboard: Enable tensorboard logging. It is disabled by default. - - CAUTION: Tensorboard logging may cause performance overhead. + - -ep/--enable_profiling: Enable nsys profiling. It is disabled by default. When enabled, profiling will be enabled for 1 step from step 5 to step 6. You can change the step in the respective recipe script. + - -tb/--tensorboard: Enable tensorboard logging. It is disabled by default. + - CAUTION: Tensorboard logging may cause performance overhead. + - -f/--finetuning: Finetuning scheme to use. Options- 'sft', 'lora'. Defaults is 'lora'. + - -hf/--hf_token: HuggingFace access token. Defaults to None. Required for accessing tokenizers and checkpoints from HuggingFace. + - -nh/--nemo_home: Directory where NeMo searches for models and checkpoints. This saves a lot of time (especially for bigger models) if checkpoints already exist here. Missing files will be downloaded from HuggingFace. Defaults to environment variable DEFAULT_NEMO_CACHE_HOME = ~/.cache/nemo - -d/--dryrun: Using this argument will not launch the experiment. It will simply print the sbatch script to stdout. 
This can be helpful to verify you have set your experiment correctly as needed. - You don't need to set any value for `--enable_profiling`, `--tensorboard` and `--dryrun`. See the below example for reference- `python3 scripts/llm/performance/llama3_8b.py --account -p -ep --tensorboard -d` diff --git a/scripts/llm/performance/finetuning_llama3_405b.py b/scripts/llm/performance/finetuning_llama3_405b.py index 9c57f523b70d..5ba1528e04a2 100644 --- a/scripts/llm/performance/finetuning_llama3_405b.py +++ b/scripts/llm/performance/finetuning_llama3_405b.py @@ -97,6 +97,8 @@ def llama31_405b_performance_recipe( if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: if comm_overlap_callback_idx >= 0: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx: + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_overlap = False # Misc. for overall faster experiment runtime recipe.log.ckpt = None @@ -131,7 +133,6 @@ def llama31_405b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={ - "NVTE_FUSED_ATTN": "0", "NVTE_FLASH_ATTN": "1", }, hf_token=args.hf_token, @@ -165,7 +166,7 @@ def llama31_405b_performance_recipe( plugins.append(NsysPlugin(start_step=5, end_step=6)) with run.Experiment(exp_name) as exp: - exp.add(*import_ckpt_experiment(NUM_NODES, executor, model(), source=f"hf://{HF_MODEL_URI}")) + exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}")) exp.add( recipe, executor=executor, diff --git a/scripts/llm/performance/finetuning_llama3_70b.py b/scripts/llm/performance/finetuning_llama3_70b.py index f11b0b2aa868..ec290c2c491d 100644 --- a/scripts/llm/performance/finetuning_llama3_70b.py +++ b/scripts/llm/performance/finetuning_llama3_70b.py @@ -97,6 +97,8 @@ def llama3_70b_performance_recipe( if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: if comm_overlap_callback_idx >= 0: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx: + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_overlap = False # Misc. for overall faster experiment runtime recipe.log.ckpt = None @@ -131,7 +133,6 @@ def llama3_70b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={ - "NVTE_FUSED_ATTN": "0", "NVTE_FLASH_ATTN": "1", }, hf_token=args.hf_token, @@ -165,7 +166,7 @@ def llama3_70b_performance_recipe( plugins.append(NsysPlugin(start_step=5, end_step=6)) with run.Experiment(exp_name) as exp: - exp.add(*import_ckpt_experiment(NUM_NODES, executor, model(), source=f"hf://{HF_MODEL_URI}")) + exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}")) exp.add( recipe, executor=executor, diff --git a/scripts/llm/performance/finetuning_llama3_8b.py b/scripts/llm/performance/finetuning_llama3_8b.py index 1e8cafed54b3..0391c54c1cd5 100644 --- a/scripts/llm/performance/finetuning_llama3_8b.py +++ b/scripts/llm/performance/finetuning_llama3_8b.py @@ -97,6 +97,8 @@ def llama3_8b_performance_recipe( if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: if comm_overlap_callback_idx >= 0: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx: + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_overlap = False # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None @@ -131,7 +133,6 @@ def llama3_8b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={ - "NVTE_FUSED_ATTN": "0", "NVTE_FLASH_ATTN": "1", }, hf_token=args.hf_token, @@ -165,7 +166,7 @@ def llama3_8b_performance_recipe( plugins.append(NsysPlugin(start_step=5, end_step=6)) with run.Experiment(exp_name) as exp: - exp.add(*import_ckpt_experiment(NUM_NODES, executor, model(), source=f"hf://{HF_MODEL_URI}")) + exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}")) exp.add( recipe, executor=executor, diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index c02f1d9004ca..c9ccb392d26b 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -51,11 +51,11 @@ def slurm_executor( """ err_msgs = [] if log_dir != NEMORUN_HOME: - err_msgs.append(f"Run `export NEMORUN_HOME={log_dir}` in your shell environment and rerun this script.") + err_msgs.append(f"\nRun `export NEMORUN_HOME={log_dir}` in your shell environment and rerun this script.") if nemo_home != DEFAULT_NEMO_HOME: err_msgs.append(f"Run `export NEMO_HOME={nemo_home}` in your shell environment and rerun this script.") if len(err_msgs) > 0: - logging.error(err_msgs) + logging.error("\n".join(err_msgs)) sys.exit(1) env_vars = { @@ -72,7 +72,7 @@ def slurm_executor( mounts = [] srun_args = ["--mpi=pmix"] - if nemo_home != DEFAULT_NEMO_HOME: + if nemo_home != DEFAULT_NEMO_CACHE_HOME: # DO NOT change this 'DEFAULT_NEMO_HOME'/'NEMO_HOME' env_vars.update({"NEMO_HOME": nemo_home}) mounts.extend([f"{nemo_home}:{nemo_home}"]) if hf_token is not None: @@ -113,7 +113,7 @@ def hf_tokenizer(model_name: str) -> run.Config[AutoTokenizer]: huggingface.co/docs/transformers/v4.47.1/en/model_doc/auto#transformers.AutoTokenizer """ log_msg = [ - "AutoTokenizer first searches for tokenizer files locally in env var 'NEMO_HOME'.", + f"AutoTokenizer first searches for tokenizer files locally in env var {DEFAULT_NEMO_HOME}.", "If files are missing locally, AutoTokenizer will try downloading from HuggingFace.", "Make sure 'TRANSFORMERS_OFFLINE=0' and 'HF_TOKEN:'.", "You can set them as scripts.llm.performance.utils.slurm_executor(custom_env_vars=", @@ -128,7 +128,7 @@ def hf_tokenizer(model_name: str) -> run.Config[AutoTokenizer]: ) -def import_ckpt_experiment(num_nodes: int, executor: run.SlurmExecutor, model: run.Config[GPTModel], source: str): +def import_ckpt_experiment(executor: run.SlurmExecutor, model: run.Config[GPTModel], source: str): """ Downloads/Acceses checkpoint to be used for fine-tuning. `import_ckpt` first tries find the nemo checkpoint in /models/. For eg: for llama3 8b, the path will look like- /models/meta-llama/Meta-Llama-3-8B @@ -142,12 +142,13 @@ def import_ckpt_experiment(num_nodes: int, executor: run.SlurmExecutor, model: r from nemo.collections.llm import import_ckpt import_executor = deepcopy(executor) - import_executor.ntasks_per_node = num_nodes + import_executor.ntasks_per_node = 1 + import_executor.nodes = 1 return run.Partial(import_ckpt, model=model, source=source, overwrite=False), import_executor, "import_ckpt_exp" -def isfile_train_pack_metadata(hf_model_uri: str, data_config: run.Config[SquadDataModule]): +def isfile_train_pack_metadata(hf_model_uri: str, data_config: run.Config[SquadDataModule]) -> bool: """ This method is used for fine-tuning. It checks if packed train data for a partiular sequence length exists locally. 
This is needed to set data flag (force_redownload=True) @@ -155,14 +156,14 @@ def isfile_train_pack_metadata(hf_model_uri: str, data_config: run.Config[SquadD """ datasets_dir = os.getenv("NEMO_DATASETS_CACHE", os.path.join(DEFAULT_NEMO_HOME, "datasets")) model_dir = hf_model_uri.replace("/", "--") - metadata_filename = f"train_{data_config.seq_length}_metadata.jsonl" + metadata_filename = f"{data_config.seq_length}_metadata.jsonl" train_pack_metadata_filepath = os.path.join(datasets_dir, "squad", "packed", model_dir, metadata_filename) return os.path.exists(train_pack_metadata_filepath) and os.path.isfile(train_pack_metadata_filepath) -def get_comm_overlap_callback_idx(callbacks: List[Callback]): +def get_comm_overlap_callback_idx(callbacks: List[Callback]) -> int | None: """ nemo.lightning.Trainer has a list of callbacks defined. This method identifies index of MegatronCommOverlapCallback from the list defined in recipes in nemo.collections.llm.recipes. The index is needed to override ddp communication @@ -170,9 +171,9 @@ def get_comm_overlap_callback_idx(callbacks: List[Callback]): """ if callbacks: # default is None in lightning for idx, callback in enumerate(callbacks): - if isinstance(callback, MegatronCommOverlapCallback): + if callback.__fn_or_cls__ == MegatronCommOverlapCallback: return idx - return -1 + return None def parse_cli_args(): From 4df3bda1add5846500f6808e2bb975bc03938d77 Mon Sep 17 00:00:00 2001 From: malay-nagda Date: Fri, 10 Jan 2025 09:42:59 +0000 Subject: [PATCH 11/32] Apply isort and black reformatting Signed-off-by: malay-nagda --- scripts/llm/performance/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index c9ccb392d26b..1dc524b02848 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -72,7 +72,7 @@ def slurm_executor( mounts = [] srun_args = ["--mpi=pmix"] - if nemo_home != DEFAULT_NEMO_CACHE_HOME: # DO NOT change this 'DEFAULT_NEMO_HOME'/'NEMO_HOME' + if nemo_home != DEFAULT_NEMO_CACHE_HOME: # DO NOT change this 'DEFAULT_NEMO_HOME'/'NEMO_HOME' env_vars.update({"NEMO_HOME": nemo_home}) mounts.extend([f"{nemo_home}:{nemo_home}"]) if hf_token is not None: From 4a59854660a91ca56b49659ce9fb87791893e903 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Fri, 10 Jan 2025 16:11:15 +0530 Subject: [PATCH 12/32] renamed files Signed-off-by: Malay Nagda --- .../{finetuning_llama3_405b.py => finetune_llama3_405b.py} | 0 .../{finetuning_llama3_70b.py => finetune_llama3_70b.py} | 0 .../{finetuning_llama3_8b.py => finetune_llama3_8b.py} | 0 .../llm/performance/{mixtral_8x7b.py => pretrain_8x7b.py} | 3 ++- .../performance/{mixtral_8x22b.py => pretrain__8x22b.py} | 7 ++++--- .../performance/{gpt3_175b.py => pretrain_gpt3_175b.py} | 3 ++- .../{llama3_405b.py => pretrain_llama31_405b.py} | 3 ++- .../performance/{llama3_70b.py => pretrain_llama3_70b.py} | 3 ++- .../performance/{llama3_8b.py => pretrain_llama3_8b.py} | 3 ++- scripts/llm/performance/utils.py | 2 +- 10 files changed, 15 insertions(+), 9 deletions(-) rename scripts/llm/performance/{finetuning_llama3_405b.py => finetune_llama3_405b.py} (100%) rename scripts/llm/performance/{finetuning_llama3_70b.py => finetune_llama3_70b.py} (100%) rename scripts/llm/performance/{finetuning_llama3_8b.py => finetune_llama3_8b.py} (100%) rename scripts/llm/performance/{mixtral_8x7b.py => pretrain_8x7b.py} (98%) rename scripts/llm/performance/{mixtral_8x22b.py => pretrain__8x22b.py} (96%) rename 
scripts/llm/performance/{gpt3_175b.py => pretrain_gpt3_175b.py} (98%) rename scripts/llm/performance/{llama3_405b.py => pretrain_llama31_405b.py} (98%) rename scripts/llm/performance/{llama3_70b.py => pretrain_llama3_70b.py} (98%) rename scripts/llm/performance/{llama3_8b.py => pretrain_llama3_8b.py} (98%) diff --git a/scripts/llm/performance/finetuning_llama3_405b.py b/scripts/llm/performance/finetune_llama3_405b.py similarity index 100% rename from scripts/llm/performance/finetuning_llama3_405b.py rename to scripts/llm/performance/finetune_llama3_405b.py diff --git a/scripts/llm/performance/finetuning_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py similarity index 100% rename from scripts/llm/performance/finetuning_llama3_70b.py rename to scripts/llm/performance/finetune_llama3_70b.py diff --git a/scripts/llm/performance/finetuning_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py similarity index 100% rename from scripts/llm/performance/finetuning_llama3_8b.py rename to scripts/llm/performance/finetune_llama3_8b.py diff --git a/scripts/llm/performance/mixtral_8x7b.py b/scripts/llm/performance/pretrain_8x7b.py similarity index 98% rename from scripts/llm/performance/mixtral_8x7b.py rename to scripts/llm/performance/pretrain_8x7b.py index 9249eedfdc8e..3a28e45e11e6 100644 --- a/scripts/llm/performance/mixtral_8x7b.py +++ b/scripts/llm/performance/pretrain_8x7b.py @@ -130,7 +130,8 @@ def mixtral_8x7b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = mixtral_8x7b_performance_recipe( diff --git a/scripts/llm/performance/mixtral_8x22b.py b/scripts/llm/performance/pretrain__8x22b.py similarity index 96% rename from scripts/llm/performance/mixtral_8x22b.py rename to scripts/llm/performance/pretrain__8x22b.py index 485c6c61349e..25bbb9b7fda7 100644 --- a/scripts/llm/performance/mixtral_8x22b.py +++ b/scripts/llm/performance/pretrain__8x22b.py @@ -17,7 +17,7 @@ import nemo_run as run from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor -from nemo.collections.llm.recipes.mixtral_8x7b import pretrain_recipe +from nemo.collections.llm.recipes.mixtral_8x22b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin @@ -48,7 +48,7 @@ def mixtral_8x22b_performance_recipe( max_steps: int, ): """ - mixtral 8x7b pre-train recipe aimed at achieving best possible performance. + mixtral 8x22b pre-train recipe aimed at achieving best possible performance. NOTE: Use fp8 precision training with caution. It might not give desirable results. 
""" @@ -130,7 +130,8 @@ def mixtral_8x22b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = mixtral_8x22b_performance_recipe( diff --git a/scripts/llm/performance/gpt3_175b.py b/scripts/llm/performance/pretrain_gpt3_175b.py similarity index 98% rename from scripts/llm/performance/gpt3_175b.py rename to scripts/llm/performance/pretrain_gpt3_175b.py index cb3a989bbc67..dea6ee1b151d 100644 --- a/scripts/llm/performance/gpt3_175b.py +++ b/scripts/llm/performance/pretrain_gpt3_175b.py @@ -130,7 +130,8 @@ def gpt3_175b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = gpt3_175b_performance_recipe( diff --git a/scripts/llm/performance/llama3_405b.py b/scripts/llm/performance/pretrain_llama31_405b.py similarity index 98% rename from scripts/llm/performance/llama3_405b.py rename to scripts/llm/performance/pretrain_llama31_405b.py index e5161344fc70..8f7c01041261 100644 --- a/scripts/llm/performance/llama3_405b.py +++ b/scripts/llm/performance/pretrain_llama31_405b.py @@ -130,7 +130,8 @@ def llama3_405b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = llama3_405b_performance_recipe( diff --git a/scripts/llm/performance/llama3_70b.py b/scripts/llm/performance/pretrain_llama3_70b.py similarity index 98% rename from scripts/llm/performance/llama3_70b.py rename to scripts/llm/performance/pretrain_llama3_70b.py index 41ae3d6717f9..d4b369c36f15 100644 --- a/scripts/llm/performance/llama3_70b.py +++ b/scripts/llm/performance/pretrain_llama3_70b.py @@ -130,7 +130,8 @@ def llama3_70b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = llama3_70b_performance_recipe( diff --git a/scripts/llm/performance/llama3_8b.py b/scripts/llm/performance/pretrain_llama3_8b.py similarity index 98% rename from scripts/llm/performance/llama3_8b.py rename to scripts/llm/performance/pretrain_llama3_8b.py index 6adf98765f2a..e8dbd4ba9ead 100644 --- a/scripts/llm/performance/llama3_8b.py +++ b/scripts/llm/performance/pretrain_llama3_8b.py @@ -127,7 +127,8 @@ def llama3_8b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = llama3_8b_performance_recipe( diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index 1dc524b02848..b892ec4468eb 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -65,7 +65,7 @@ def slurm_executor( "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", "NVTE_ASYNC_AMAX_REDUCTION": "1", "NVTE_FUSED_ATTN": "1", - "NVTE_FLASH_ATTN": "0", + "NVTE_FLASH_ATTN": "1", "NEMO_LOG_MEMORY_USAGE": "1", "NEMORUN_HOME": log_dir, } From c6d3c82b45ccf1adeb2e9b9dd4e85c98688a5023 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Sat, 11 Jan 2025 00:58:11 +0530 Subject: [PATCH 13/32] tp comm Signed-off-by: Malay Nagda --- .../{finetune_llama3_405b.py => finetune_llama31_405b.py} | 2 +- scripts/llm/performance/finetune_llama3_70b.py | 2 +- scripts/llm/performance/finetune_llama3_8b.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename scripts/llm/performance/{finetune_llama3_405b.py => finetune_llama31_405b.py} (99%) diff --git 
a/scripts/llm/performance/finetune_llama3_405b.py b/scripts/llm/performance/finetune_llama31_405b.py similarity index 99% rename from scripts/llm/performance/finetune_llama3_405b.py rename to scripts/llm/performance/finetune_llama31_405b.py index 5ba1528e04a2..90a121495651 100644 --- a/scripts/llm/performance/finetune_llama3_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -97,7 +97,7 @@ def llama31_405b_performance_recipe( if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: if comm_overlap_callback_idx >= 0: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True - if comm_overlap_callback_idx: + if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_overlap = False # Misc. for overall faster experiment runtime diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index ec290c2c491d..be87b26cfb89 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -97,7 +97,7 @@ def llama3_70b_performance_recipe( if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: if comm_overlap_callback_idx >= 0: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True - if comm_overlap_callback_idx: + if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_overlap = False # Misc. for overall faster experiment runtime diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index 0391c54c1cd5..d88792cd81c1 100644 --- a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -97,7 +97,7 @@ def llama3_8b_performance_recipe( if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: if comm_overlap_callback_idx >= 0: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True - if comm_overlap_callback_idx: + if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_overlap = False # Misc. for overall faster experiment runtime From c6c044a0c435194a4fee3a944c0c0e9792e61a8f Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Sun, 12 Jan 2025 18:52:30 +0530 Subject: [PATCH 14/32] mpi tp comm Signed-off-by: Malay Nagda --- scripts/llm/performance/finetune_llama31_405b.py | 4 ++-- scripts/llm/performance/finetune_llama3_70b.py | 4 ++-- scripts/llm/performance/finetune_llama3_8b.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/llm/performance/finetune_llama31_405b.py b/scripts/llm/performance/finetune_llama31_405b.py index 90a121495651..e52696f11682 100644 --- a/scripts/llm/performance/finetune_llama31_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -98,7 +98,7 @@ def llama31_405b_performance_recipe( if comm_overlap_callback_idx >= 0: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True if comm_overlap_callback_idx is not None: - recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_overlap = False + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None @@ -161,7 +161,7 @@ def llama31_405b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index be87b26cfb89..b88498cfec25 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -98,7 +98,7 @@ def llama3_70b_performance_recipe( if comm_overlap_callback_idx >= 0: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True if comm_overlap_callback_idx is not None: - recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_overlap = False + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" # Misc. for overall faster experiment runtime recipe.log.ckpt = None @@ -161,7 +161,7 @@ def llama3_70b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index d88792cd81c1..0bea8cab9368 100644 --- a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -98,7 +98,7 @@ def llama3_8b_performance_recipe( if comm_overlap_callback_idx >= 0: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True if comm_overlap_callback_idx is not None: - recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_overlap = False + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" # Misc. for overall faster experiment runtime recipe.log.ckpt = None From bc65fe470f2592c795dc0ceb64e12873136e8269 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Sun, 12 Jan 2025 20:32:23 +0530 Subject: [PATCH 15/32] nemotron recipes Signed-off-by: Malay Nagda --- .../llm/performance/pretrain_nemotron3_22b.py | 171 ++++++++++++++++++ .../llm/performance/pretrain_nemotron3_8b.py | 171 ++++++++++++++++++ .../llm/performance/pretrain_nemotron4_15b.py | 171 ++++++++++++++++++ .../performance/pretrain_nemotron4_340b.py | 171 ++++++++++++++++++ 4 files changed, 684 insertions(+) create mode 100644 scripts/llm/performance/pretrain_nemotron3_22b.py create mode 100644 scripts/llm/performance/pretrain_nemotron3_8b.py create mode 100644 scripts/llm/performance/pretrain_nemotron4_15b.py create mode 100644 scripts/llm/performance/pretrain_nemotron4_340b.py diff --git a/scripts/llm/performance/pretrain_nemotron3_22b.py b/scripts/llm/performance/pretrain_nemotron3_22b.py new file mode 100644 index 000000000000..7a8b86aff259 --- /dev/null +++ b/scripts/llm/performance/pretrain_nemotron3_22b.py @@ -0,0 +1,171 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from os.path import basename +from typing import Optional + +import nemo_run as run +from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor + +from nemo.collections.llm.recipes.nemotron3_22b import pretrain_recipe +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + +NUM_NODES = 2 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 1 +GLOBAL_BATCH_SIZE = 32 +TP_SIZE = 2 +PP_SIZE = 4 +CP_SIZE = 1 +VP_SIZE = 10 +MAX_STEPS = 100 + + +def nemotron3_22b_performance_recipe( + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + nemotron3 22b pre-train recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results. + """ + recipe = pretrain_recipe(performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run + recipe.data.tokenizer = run.Config( + get_nmt_tokenizer, library="null_lib", model_name = "NullTokenizer",vocab_size = 256000 + ) + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + garbage_collection_callback = run.Config( + GarbageCollectionCallback, + gc_interval_train=100, + gc_interval_val=500, + ) + recipe.trainer.callbacks.extend( + [ + garbage_collection_callback, + ] + ) + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: + if comm_overlap_callback_idx >= 0: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + + # Misc. 
for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + basename(__file__), + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={}, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = nemotron3_22b_performance_recipe( + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. + recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/pretrain_nemotron3_8b.py b/scripts/llm/performance/pretrain_nemotron3_8b.py new file mode 100644 index 000000000000..3065ae21332b --- /dev/null +++ b/scripts/llm/performance/pretrain_nemotron3_8b.py @@ -0,0 +1,171 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
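The nemotron pretrain recipes in this series configure the data module with a null tokenizer instead of a HuggingFace one, presumably so the benchmark does not depend on real tokenizer assets being available on the cluster. A hedged sketch of that pattern, assuming a `recipe` built by the corresponding pretrain_recipe; the values mirror the code above:

    import nemo_run as run
    from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

    # run.Config defers construction: get_nmt_tokenizer(...) is only called when the
    # recipe is materialized on the cluster, not while this launcher script runs.
    recipe.data.tokenizer = run.Config(
        get_nmt_tokenizer,
        library="null_lib",
        model_name="NullTokenizer",
        vocab_size=256000,  # vocab size used by the nemotron recipes in this series
    )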
+ +from os.path import basename +from typing import Optional + +import nemo_run as run +from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor + +from nemo.collections.llm.recipes.nemotron3_8b import pretrain_recipe +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + +NUM_NODES = 1 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 2 +GLOBAL_BATCH_SIZE = 32 +TP_SIZE = 2 +PP_SIZE = 1 +CP_SIZE = 1 +VP_SIZE = None +MAX_STEPS = 100 + + +def nemotron3_8b_performance_recipe( + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + nemotron3 8b pre-train recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results. + """ + recipe = pretrain_recipe(performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run + recipe.data.tokenizer = run.Config( + get_nmt_tokenizer, library="null_lib", model_name = "NullTokenizer",vocab_size = 256000 + ) + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + garbage_collection_callback = run.Config( + GarbageCollectionCallback, + gc_interval_train=100, + gc_interval_val=500, + ) + recipe.trainer.callbacks.extend( + [ + garbage_collection_callback, + ] + ) + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: + if comm_overlap_callback_idx >= 0: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + + # Misc. 
for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + basename(__file__), + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={}, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = nemotron3_8b_performance_recipe( + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. + recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/pretrain_nemotron4_15b.py b/scripts/llm/performance/pretrain_nemotron4_15b.py new file mode 100644 index 000000000000..d283b1645fe3 --- /dev/null +++ b/scripts/llm/performance/pretrain_nemotron4_15b.py @@ -0,0 +1,171 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
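Every recipe in this series gates overlap_param_gather_with_optimizer_step on the derived data-parallel size. A short sketch with the arithmetic written out, using the nemotron3_8b defaults from above; `recipe` and `comm_overlap_callback_idx` are assumed to come from the surrounding script:

    num_nodes, num_gpus_per_node = 1, 8
    tp_size, pp_size, cp_size, vp_size = 2, 1, 1, None

    # DP size is what remains of the world size after TP/PP/CP are carved out:
    # here 1 * 8 / (2 * 1 * 1) = 4.0
    dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size)

    # The overlap is only switched on when there is both data parallelism and an
    # interleaved (virtual) pipeline schedule; with the defaults above
    # (pp_size == 1, vp_size is None) the callback flag is left untouched.
    if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1:
        # the extra None check guards against the helper returning None when the
        # comm-overlap callback is absent from the recipe
        if comm_overlap_callback_idx is not None and comm_overlap_callback_idx >= 0:
            recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True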
+ +from os.path import basename +from typing import Optional + +import nemo_run as run +from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor + +from nemo.collections.llm.recipes.nemotron4_15b import pretrain_recipe +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + +NUM_NODES = 8 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 4 +GLOBAL_BATCH_SIZE = 256 +TP_SIZE = 2 +PP_SIZE = 1 +CP_SIZE = 1 +VP_SIZE = None +MAX_STEPS = 100 + + +def nemotron4_15b_performance_recipe( + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + nemotron4 15b pre-train recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results. + """ + recipe = pretrain_recipe(performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run + recipe.data.tokenizer = run.Config( + get_nmt_tokenizer, library="null_lib", model_name = "NullTokenizer",vocab_size = 256000 + ) + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + garbage_collection_callback = run.Config( + GarbageCollectionCallback, + gc_interval_train=100, + gc_interval_val=500, + ) + recipe.trainer.callbacks.extend( + [ + garbage_collection_callback, + ] + ) + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: + if comm_overlap_callback_idx >= 0: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + + # Misc. 
for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + basename(__file__), + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={}, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = nemotron4_15b_performance_recipe( + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. + recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/pretrain_nemotron4_340b.py b/scripts/llm/performance/pretrain_nemotron4_340b.py new file mode 100644 index 000000000000..42222209bb85 --- /dev/null +++ b/scripts/llm/performance/pretrain_nemotron4_340b.py @@ -0,0 +1,171 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from os.path import basename +from typing import Optional + +import nemo_run as run +from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor + +from nemo.collections.llm.recipes.nemotron4_340b import pretrain_recipe +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer + +NUM_NODES = 16 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 1 +GLOBAL_BATCH_SIZE = 32 +TP_SIZE = 8 +PP_SIZE = 8 +CP_SIZE = 2 +VP_SIZE = 12 +MAX_STEPS = 100 + + +def nemotron4_340b_performance_recipe( + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + nemotron4 340b pre-train recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results. + """ + recipe = pretrain_recipe(performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run + recipe.data.tokenizer = run.Config( + get_nmt_tokenizer, library="null_lib", model_name = "NullTokenizer",vocab_size = 256000 + ) + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + garbage_collection_callback = run.Config( + GarbageCollectionCallback, + gc_interval_train=100, + gc_interval_val=500, + ) + recipe.trainer.callbacks.extend( + [ + garbage_collection_callback, + ] + ) + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: + if comm_overlap_callback_idx >= 0: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + + # Misc. 
for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + basename(__file__), + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={}, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = nemotron4_340b_performance_recipe( + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. + recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() From d108b37b9673a05f825457ba3ed82bbc66cb68c3 Mon Sep 17 00:00:00 2001 From: malay-nagda Date: Sun, 12 Jan 2025 15:03:24 +0000 Subject: [PATCH 16/32] Apply isort and black reformatting Signed-off-by: malay-nagda --- scripts/llm/performance/pretrain_nemotron3_22b.py | 4 ++-- scripts/llm/performance/pretrain_nemotron3_8b.py | 4 ++-- scripts/llm/performance/pretrain_nemotron4_15b.py | 4 ++-- scripts/llm/performance/pretrain_nemotron4_340b.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/llm/performance/pretrain_nemotron3_22b.py b/scripts/llm/performance/pretrain_nemotron3_22b.py index 7a8b86aff259..8c2402eb12a4 100644 --- a/scripts/llm/performance/pretrain_nemotron3_22b.py +++ b/scripts/llm/performance/pretrain_nemotron3_22b.py @@ -20,9 +20,9 @@ from nemo.collections.llm.recipes.nemotron3_22b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer NUM_NODES = 2 NUM_GPUS_PER_NODE = 8 @@ -59,7 +59,7 @@ def nemotron3_22b_performance_recipe( recipe.data.global_batch_size = gbs recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run recipe.data.tokenizer = run.Config( - get_nmt_tokenizer, library="null_lib", model_name = "NullTokenizer",vocab_size = 256000 + get_nmt_tokenizer, library="null_lib", model_name="NullTokenizer", vocab_size=256000 ) recipe.trainer.max_steps = max_steps diff --git a/scripts/llm/performance/pretrain_nemotron3_8b.py b/scripts/llm/performance/pretrain_nemotron3_8b.py index 
3065ae21332b..f77564e12593 100644 --- a/scripts/llm/performance/pretrain_nemotron3_8b.py +++ b/scripts/llm/performance/pretrain_nemotron3_8b.py @@ -20,9 +20,9 @@ from nemo.collections.llm.recipes.nemotron3_8b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer NUM_NODES = 1 NUM_GPUS_PER_NODE = 8 @@ -59,7 +59,7 @@ def nemotron3_8b_performance_recipe( recipe.data.global_batch_size = gbs recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run recipe.data.tokenizer = run.Config( - get_nmt_tokenizer, library="null_lib", model_name = "NullTokenizer",vocab_size = 256000 + get_nmt_tokenizer, library="null_lib", model_name="NullTokenizer", vocab_size=256000 ) recipe.trainer.max_steps = max_steps diff --git a/scripts/llm/performance/pretrain_nemotron4_15b.py b/scripts/llm/performance/pretrain_nemotron4_15b.py index d283b1645fe3..ee0343cf3a52 100644 --- a/scripts/llm/performance/pretrain_nemotron4_15b.py +++ b/scripts/llm/performance/pretrain_nemotron4_15b.py @@ -20,9 +20,9 @@ from nemo.collections.llm.recipes.nemotron4_15b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer NUM_NODES = 8 NUM_GPUS_PER_NODE = 8 @@ -59,7 +59,7 @@ def nemotron4_15b_performance_recipe( recipe.data.global_batch_size = gbs recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run recipe.data.tokenizer = run.Config( - get_nmt_tokenizer, library="null_lib", model_name = "NullTokenizer",vocab_size = 256000 + get_nmt_tokenizer, library="null_lib", model_name="NullTokenizer", vocab_size=256000 ) recipe.trainer.max_steps = max_steps diff --git a/scripts/llm/performance/pretrain_nemotron4_340b.py b/scripts/llm/performance/pretrain_nemotron4_340b.py index 42222209bb85..1341eab39019 100644 --- a/scripts/llm/performance/pretrain_nemotron4_340b.py +++ b/scripts/llm/performance/pretrain_nemotron4_340b.py @@ -20,9 +20,9 @@ from nemo.collections.llm.recipes.nemotron4_340b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer NUM_NODES = 16 NUM_GPUS_PER_NODE = 8 @@ -59,7 +59,7 @@ def nemotron4_340b_performance_recipe( recipe.data.global_batch_size = gbs recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run recipe.data.tokenizer = run.Config( - get_nmt_tokenizer, library="null_lib", model_name = "NullTokenizer",vocab_size = 256000 + get_nmt_tokenizer, library="null_lib", model_name="NullTokenizer", vocab_size=256000 ) 
recipe.trainer.max_steps = max_steps From d45209bd5c48f45a6991853eb7344647492e5acc Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Mon, 13 Jan 2025 14:56:23 +0530 Subject: [PATCH 17/32] formatting, cleanup & nemotron tokenizer Signed-off-by: Malay Nagda --- scripts/llm/performance/finetune_llama31_405b.py | 8 +++----- scripts/llm/performance/finetune_llama3_70b.py | 8 +++----- scripts/llm/performance/finetune_llama3_8b.py | 8 +++----- scripts/llm/performance/pretrain_gpt3_175b.py | 10 ++++------ scripts/llm/performance/pretrain_llama31_405b.py | 10 ++++------ scripts/llm/performance/pretrain_llama3_70b.py | 10 ++++------ scripts/llm/performance/pretrain_llama3_8b.py | 10 ++++------ ...retrain__8x22b.py => pretrain_mixtral_8x22b.py} | 10 ++++------ .../{pretrain_8x7b.py => pretrain_mixtral_8x7b.py} | 10 ++++------ scripts/llm/performance/pretrain_nemotron3_22b.py | 14 +++++++++----- scripts/llm/performance/pretrain_nemotron3_8b.py | 2 +- scripts/llm/performance/pretrain_nemotron4_15b.py | 2 +- scripts/llm/performance/pretrain_nemotron4_340b.py | 14 +++++++++----- 13 files changed, 53 insertions(+), 63 deletions(-) rename scripts/llm/performance/{pretrain__8x22b.py => pretrain_mixtral_8x22b.py} (96%) rename scripts/llm/performance/{pretrain_8x7b.py => pretrain_mixtral_8x7b.py} (96%) diff --git a/scripts/llm/performance/finetune_llama31_405b.py b/scripts/llm/performance/finetune_llama31_405b.py index e52696f11682..d7c3e30312de 100644 --- a/scripts/llm/performance/finetune_llama31_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from os.path import basename from typing import Optional import nemo_run as run @@ -80,10 +81,7 @@ def llama31_405b_performance_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = pp_size recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -114,7 +112,7 @@ def llama31_405b_performance_recipe( exp_name = "_".join( [ - args.finetuning.lower(), + basename(__file__), f"llama31_405b", args.compute_dtype, f"{NUM_NODES}nodes", diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index b88498cfec25..4b8356b11423 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
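# A standalone sketch of the parallelism-derived settings that recur in these
# recipes: data-parallel size, the sequence-parallel toggle, and the condition
# under which parameter all-gather is overlapped with the optimizer step.
# The function name is hypothetical; the recipes set these fields inline.
from typing import Optional


def derive_parallel_flags(
    num_nodes: int,
    num_gpus_per_node: int,
    tp_size: int,
    pp_size: int,
    cp_size: int,
    vp_size: Optional[int],
) -> dict:
    """Return the derived settings as a plain dict."""
    world_size = num_nodes * num_gpus_per_node
    model_parallel = tp_size * pp_size * cp_size
    assert world_size % model_parallel == 0, "TP*PP*CP must divide the world size"
    dp_size = world_size // model_parallel
    return {
        "dp_size": dp_size,
        # Sequence parallelism is only meaningful together with tensor parallelism.
        "sequence_parallel": bool(tp_size > 1),
        # Overlapping param gather with the optimizer step needs data, pipeline
        # and virtual-pipeline parallelism all greater than 1.
        "overlap_param_gather_with_optimizer_step": bool(
            dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1
        ),
    }


if __name__ == "__main__":
    # 16 nodes x 8 GPUs with TP8/PP8/CP2/VP12 (the nemotron4 340b layout): dp_size == 1
    print(derive_parallel_flags(16, 8, 8, 8, 2, 12))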
+from os.path import basename from typing import Optional import nemo_run as run @@ -80,10 +81,7 @@ def llama3_70b_performance_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = pp_size recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -114,7 +112,7 @@ def llama3_70b_performance_recipe( exp_name = "_".join( [ - args.finetuning.lower(), + basename(__file__), f"llama3_70b", args.compute_dtype, f"{NUM_NODES}nodes", diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index 0bea8cab9368..7a4a0179f3e9 100644 --- a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from os.path import basename from typing import Optional import nemo_run as run @@ -80,10 +81,7 @@ def llama3_8b_performance_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = pp_size recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -114,7 +112,7 @@ def llama3_8b_performance_recipe( exp_name = "_".join( [ - args.finetuning.lower(), + basename(__file__), f"llama3_8b", args.compute_dtype, f"{NUM_NODES}nodes", diff --git a/scripts/llm/performance/pretrain_gpt3_175b.py b/scripts/llm/performance/pretrain_gpt3_175b.py index dea6ee1b151d..0725ac74f06d 100644 --- a/scripts/llm/performance/pretrain_gpt3_175b.py +++ b/scripts/llm/performance/pretrain_gpt3_175b.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
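# A short sketch of the data-sizing arithmetic used by these recipes: the number
# of training samples is chosen so the run never wraps into a second epoch, and
# validation is deferred to the final step. Pure arithmetic, so this mirrors the
# recipe fields rather than touching NeMo objects; the function name is hypothetical.
def one_epoch_data_sizing(max_steps: int, gbs: int, mbs: int) -> dict:
    """Match the sizing the recipes apply to recipe.data / recipe.trainer."""
    return {
        # steps x global batch x micro batch, as in the recipes; this is at least
        # the max_steps * gbs samples actually consumed, so only one epoch runs
        "num_train_samples": max_steps * gbs * mbs,
        # run validation only once, at the end of training
        "val_check_interval": max_steps,
        "log_every_n_steps": 1,
    }


if __name__ == "__main__":
    print(one_epoch_data_sizing(max_steps=100, gbs=32, mbs=1))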
+from os.path import basename from typing import Optional import nemo_run as run @@ -67,10 +68,7 @@ def gpt3_175b_performance_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = pp_size recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -84,7 +82,7 @@ def gpt3_175b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -112,7 +110,7 @@ def gpt3_175b_performance_recipe( exp_name = "_".join( [ - f"gpt3_175b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", diff --git a/scripts/llm/performance/pretrain_llama31_405b.py b/scripts/llm/performance/pretrain_llama31_405b.py index 8f7c01041261..696f145f901f 100644 --- a/scripts/llm/performance/pretrain_llama31_405b.py +++ b/scripts/llm/performance/pretrain_llama31_405b.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from os.path import basename from typing import Optional import nemo_run as run @@ -67,10 +68,7 @@ def llama3_405b_performance_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = pp_size recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -86,7 +84,7 @@ def llama3_405b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -112,7 +110,7 @@ def llama3_405b_performance_recipe( exp_name = "_".join( [ - f"llama3_405b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", diff --git a/scripts/llm/performance/pretrain_llama3_70b.py b/scripts/llm/performance/pretrain_llama3_70b.py index d4b369c36f15..6d912c5c922f 100644 --- a/scripts/llm/performance/pretrain_llama3_70b.py +++ b/scripts/llm/performance/pretrain_llama3_70b.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
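# A minimal sketch of the garbage-collection callback wiring these hunks adjust
# (gc_interval_val moves from 500 to 100). Assumes nemo_run and NeMo are
# installed; the helper name is hypothetical, the config call mirrors the diffs.
import nemo_run as run

from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback


def add_gc_callback(recipe, interval: int = 100):
    """Run Python garbage collection at a fixed step interval for train and val."""
    recipe.trainer.callbacks.extend(
        [
            run.Config(
                GarbageCollectionCallback,
                gc_interval_train=interval,
                gc_interval_val=interval,
            )
        ]
    )
    return recipe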
+from os.path import basename from typing import Optional import nemo_run as run @@ -67,10 +68,7 @@ def llama3_70b_performance_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = pp_size recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -86,7 +84,7 @@ def llama3_70b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -112,7 +110,7 @@ def llama3_70b_performance_recipe( exp_name = "_".join( [ - f"llama3_70b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", diff --git a/scripts/llm/performance/pretrain_llama3_8b.py b/scripts/llm/performance/pretrain_llama3_8b.py index e8dbd4ba9ead..d6ea1eee1b15 100644 --- a/scripts/llm/performance/pretrain_llama3_8b.py +++ b/scripts/llm/performance/pretrain_llama3_8b.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from os.path import basename from typing import Optional import nemo_run as run @@ -67,10 +68,7 @@ def llama3_8b_performance_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = pp_size recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -83,7 +81,7 @@ def llama3_8b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -109,7 +107,7 @@ def llama3_8b_performance_recipe( exp_name = "_".join( [ - f"llama3_8b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", diff --git a/scripts/llm/performance/pretrain__8x22b.py b/scripts/llm/performance/pretrain_mixtral_8x22b.py similarity index 96% rename from scripts/llm/performance/pretrain__8x22b.py rename to scripts/llm/performance/pretrain_mixtral_8x22b.py index 25bbb9b7fda7..1686e00dbf31 100644 --- a/scripts/llm/performance/pretrain__8x22b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x22b.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
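# The nemotron hunks further down in this patch switch the tokenizer on compute
# dtype: a vocab-padded NullTokenizer for bf16 runs, a real HuggingFace tokenizer
# otherwise. A sketch of that selection, assuming hf_tokenizer comes from the
# local utils module as in these scripts; the function name is hypothetical.
import nemo_run as run
from utils import hf_tokenizer

from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer


def pick_tokenizer(compute_dtype: str, vocab_size: int = 256000):
    """Null tokenizer for bf16 benchmarking, HF tokenizer for other dtypes."""
    if compute_dtype == "bf16":
        return run.Config(
            get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=vocab_size
        )
    return hf_tokenizer("nvidia/megatron-gpt2-345m")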
+from os.path import basename from typing import Optional import nemo_run as run @@ -70,10 +71,7 @@ def mixtral_8x22b_performance_recipe( recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size recipe.trainer.strategy.expert_model_parallel_size = ep_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -86,7 +84,7 @@ def mixtral_8x22b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -112,7 +110,7 @@ def mixtral_8x22b_performance_recipe( exp_name = "_".join( [ - f"mixtral_8x22b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", diff --git a/scripts/llm/performance/pretrain_8x7b.py b/scripts/llm/performance/pretrain_mixtral_8x7b.py similarity index 96% rename from scripts/llm/performance/pretrain_8x7b.py rename to scripts/llm/performance/pretrain_mixtral_8x7b.py index 3a28e45e11e6..ac1430ad2f68 100644 --- a/scripts/llm/performance/pretrain_8x7b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x7b.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from os.path import basename from typing import Optional import nemo_run as run @@ -70,10 +71,7 @@ def mixtral_8x7b_performance_recipe( recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size recipe.trainer.strategy.expert_model_parallel_size = ep_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -86,7 +84,7 @@ def mixtral_8x7b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -112,7 +110,7 @@ def mixtral_8x7b_performance_recipe( exp_name = "_".join( [ - f"mixtral_8x7b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", diff --git a/scripts/llm/performance/pretrain_nemotron3_22b.py b/scripts/llm/performance/pretrain_nemotron3_22b.py index 8c2402eb12a4..3bab89d633c0 100644 --- a/scripts/llm/performance/pretrain_nemotron3_22b.py +++ b/scripts/llm/performance/pretrain_nemotron3_22b.py @@ -16,7 +16,7 @@ from typing import Optional import nemo_run as run -from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor +from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor, hf_tokenizer from nemo.collections.llm.recipes.nemotron3_22b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed @@ -58,9 +58,13 @@ def nemotron3_22b_performance_recipe( recipe.data.micro_batch_size = mbs recipe.data.global_batch_size = gbs recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run - recipe.data.tokenizer = run.Config( - get_nmt_tokenizer, 
library="null_lib", model_name="NullTokenizer", vocab_size=256000 - ) + if compute_dtype == "bf16": + recipe.data.tokenizer = run.Config( + get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000 + ) + recipe.model.tokenizer = recipe.data.tokenizer + else: + recipe.data.tokenizer = hf_tokenizer("nvidia/megatron-gpt2-345m") recipe.trainer.max_steps = max_steps recipe.trainer.num_nodes = num_nodes @@ -84,7 +88,7 @@ def nemotron3_22b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ diff --git a/scripts/llm/performance/pretrain_nemotron3_8b.py b/scripts/llm/performance/pretrain_nemotron3_8b.py index f77564e12593..ae6ae71535fe 100644 --- a/scripts/llm/performance/pretrain_nemotron3_8b.py +++ b/scripts/llm/performance/pretrain_nemotron3_8b.py @@ -84,7 +84,7 @@ def nemotron3_8b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ diff --git a/scripts/llm/performance/pretrain_nemotron4_15b.py b/scripts/llm/performance/pretrain_nemotron4_15b.py index ee0343cf3a52..852ec9e32238 100644 --- a/scripts/llm/performance/pretrain_nemotron4_15b.py +++ b/scripts/llm/performance/pretrain_nemotron4_15b.py @@ -84,7 +84,7 @@ def nemotron4_15b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ diff --git a/scripts/llm/performance/pretrain_nemotron4_340b.py b/scripts/llm/performance/pretrain_nemotron4_340b.py index 1341eab39019..589b04c4b0bf 100644 --- a/scripts/llm/performance/pretrain_nemotron4_340b.py +++ b/scripts/llm/performance/pretrain_nemotron4_340b.py @@ -16,7 +16,7 @@ from typing import Optional import nemo_run as run -from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor +from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor, hf_tokenizer from nemo.collections.llm.recipes.nemotron4_340b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed @@ -58,9 +58,13 @@ def nemotron4_340b_performance_recipe( recipe.data.micro_batch_size = mbs recipe.data.global_batch_size = gbs recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run - recipe.data.tokenizer = run.Config( - get_nmt_tokenizer, library="null_lib", model_name="NullTokenizer", vocab_size=256000 - ) + if compute_dtype == "bf16": + recipe.data.tokenizer = run.Config( + get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000 + ) + recipe.model.tokenizer = recipe.data.tokenizer + else: + recipe.data.tokenizer = hf_tokenizer("nvidia/megatron-gpt2-345m") recipe.trainer.max_steps = max_steps recipe.trainer.num_nodes = num_nodes @@ -84,7 +88,7 @@ def nemotron4_340b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ From 7ddc1ace17c1bdb3ad350ce876ada30f38c6c0f9 Mon Sep 17 00:00:00 2001 From: malay-nagda Date: Mon, 13 Jan 2025 09:27:21 +0000 Subject: [PATCH 18/32] Apply isort and black reformatting Signed-off-by: malay-nagda --- scripts/llm/performance/pretrain_nemotron3_22b.py | 2 +- 
scripts/llm/performance/pretrain_nemotron4_340b.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/llm/performance/pretrain_nemotron3_22b.py b/scripts/llm/performance/pretrain_nemotron3_22b.py index 3bab89d633c0..2cbb2a4dfff9 100644 --- a/scripts/llm/performance/pretrain_nemotron3_22b.py +++ b/scripts/llm/performance/pretrain_nemotron3_22b.py @@ -16,7 +16,7 @@ from typing import Optional import nemo_run as run -from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor, hf_tokenizer +from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.nemotron3_22b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed diff --git a/scripts/llm/performance/pretrain_nemotron4_340b.py b/scripts/llm/performance/pretrain_nemotron4_340b.py index 589b04c4b0bf..7455f6985282 100644 --- a/scripts/llm/performance/pretrain_nemotron4_340b.py +++ b/scripts/llm/performance/pretrain_nemotron4_340b.py @@ -16,7 +16,7 @@ from typing import Optional import nemo_run as run -from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor, hf_tokenizer +from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.nemotron4_340b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed From 35e60ed0a0ef1e6a6a975c840957cdd43b5e97af Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Mon, 13 Jan 2025 15:04:43 +0530 Subject: [PATCH 19/32] supported tokenizers Signed-off-by: Malay Nagda --- nemo/collections/nlp/modules/common/tokenizer_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py index 08c39b5a67cf..de86d68481b3 100644 --- a/nemo/collections/nlp/modules/common/tokenizer_utils.py +++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py @@ -254,6 +254,6 @@ def get_nmt_tokenizer( return NullTokenizer(vocab_size) else: raise NotImplementedError( - 'Currently we only support "huggingface", "sentencepiece", "megatron", and "byte-level" tokenizer' - 'libraries.' + 'Currently we only support "huggingface", "sentencepiece", "megatron", "byte-level", "regex", "tabular",' + '"tiktoken", and "null" tokenizer libraries.' 
) From afb329a348d6278c5e1286bbffbee3faa8d81863 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Mon, 13 Jan 2025 15:26:07 +0530 Subject: [PATCH 20/32] cleanup Signed-off-by: Malay Nagda --- scripts/llm/performance/finetune_llama31_405b.py | 5 ++--- scripts/llm/performance/finetune_llama3_70b.py | 5 ++--- scripts/llm/performance/finetune_llama3_8b.py | 5 ++--- scripts/llm/performance/pretrain_gpt3_175b.py | 6 +++--- scripts/llm/performance/pretrain_llama31_405b.py | 6 +++--- scripts/llm/performance/pretrain_llama3_70b.py | 6 +++--- scripts/llm/performance/pretrain_llama3_8b.py | 6 +++--- scripts/llm/performance/pretrain_mixtral_8x22b.py | 6 +++--- scripts/llm/performance/pretrain_mixtral_8x7b.py | 6 +++--- scripts/llm/performance/pretrain_nemotron3_22b.py | 6 +++--- scripts/llm/performance/pretrain_nemotron3_8b.py | 6 +++--- scripts/llm/performance/pretrain_nemotron4_15b.py | 6 +++--- scripts/llm/performance/pretrain_nemotron4_340b.py | 6 +++--- 13 files changed, 36 insertions(+), 39 deletions(-) diff --git a/scripts/llm/performance/finetune_llama31_405b.py b/scripts/llm/performance/finetune_llama31_405b.py index d7c3e30312de..8ec9f6974ac8 100644 --- a/scripts/llm/performance/finetune_llama31_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -92,11 +92,10 @@ def llama31_405b_performance_recipe( # callback configs dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index 4b8356b11423..c47bf3d36800 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -92,11 +92,10 @@ def llama3_70b_performance_recipe( # callback configs dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index 7a4a0179f3e9..9a243227727c 100644 --- a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -92,11 +92,10 @@ def llama3_8b_performance_recipe( # callback configs dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_gpt3_175b.py b/scripts/llm/performance/pretrain_gpt3_175b.py index 0725ac74f06d..a61fc0acb758 100644 --- a/scripts/llm/performance/pretrain_gpt3_175b.py +++ b/scripts/llm/performance/pretrain_gpt3_175b.py @@ -90,9 +90,9 @@ def gpt3_175b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_llama31_405b.py b/scripts/llm/performance/pretrain_llama31_405b.py index 696f145f901f..a9689d133120 100644 --- a/scripts/llm/performance/pretrain_llama31_405b.py +++ b/scripts/llm/performance/pretrain_llama31_405b.py @@ -92,9 +92,9 @@ def llama3_405b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_llama3_70b.py b/scripts/llm/performance/pretrain_llama3_70b.py index 6d912c5c922f..4fbd0cd5d4d1 100644 --- a/scripts/llm/performance/pretrain_llama3_70b.py +++ b/scripts/llm/performance/pretrain_llama3_70b.py @@ -92,9 +92,9 @@ def llama3_70b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_llama3_8b.py b/scripts/llm/performance/pretrain_llama3_8b.py index d6ea1eee1b15..bf86537a460c 100644 --- a/scripts/llm/performance/pretrain_llama3_8b.py +++ b/scripts/llm/performance/pretrain_llama3_8b.py @@ -89,9 +89,9 @@ def llama3_8b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_mixtral_8x22b.py b/scripts/llm/performance/pretrain_mixtral_8x22b.py index 1686e00dbf31..53f9d6af42d0 100644 --- a/scripts/llm/performance/pretrain_mixtral_8x22b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x22b.py @@ -92,9 +92,9 @@ def mixtral_8x22b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_mixtral_8x7b.py b/scripts/llm/performance/pretrain_mixtral_8x7b.py index ac1430ad2f68..c22840312884 100644 --- a/scripts/llm/performance/pretrain_mixtral_8x7b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x7b.py @@ -92,9 +92,9 @@ def mixtral_8x7b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_nemotron3_22b.py b/scripts/llm/performance/pretrain_nemotron3_22b.py index 2cbb2a4dfff9..0b33e57f13aa 100644 --- a/scripts/llm/performance/pretrain_nemotron3_22b.py +++ b/scripts/llm/performance/pretrain_nemotron3_22b.py @@ -96,9 +96,9 @@ def nemotron3_22b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_nemotron3_8b.py b/scripts/llm/performance/pretrain_nemotron3_8b.py index ae6ae71535fe..17e3dbcdf00e 100644 --- a/scripts/llm/performance/pretrain_nemotron3_8b.py +++ b/scripts/llm/performance/pretrain_nemotron3_8b.py @@ -92,9 +92,9 @@ def nemotron3_8b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_nemotron4_15b.py b/scripts/llm/performance/pretrain_nemotron4_15b.py index 852ec9e32238..1288c8613668 100644 --- a/scripts/llm/performance/pretrain_nemotron4_15b.py +++ b/scripts/llm/performance/pretrain_nemotron4_15b.py @@ -92,9 +92,9 @@ def nemotron4_15b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_nemotron4_340b.py b/scripts/llm/performance/pretrain_nemotron4_340b.py index 7455f6985282..e9ea61b65095 100644 --- a/scripts/llm/performance/pretrain_nemotron4_340b.py +++ b/scripts/llm/performance/pretrain_nemotron4_340b.py @@ -96,9 +96,9 @@ def nemotron4_340b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None From 5984bb69ad997c3176ce7c907c48aa47df1a15c5 Mon Sep 17 00:00:00 2001 From: malay-nagda Date: Mon, 13 Jan 2025 09:57:03 +0000 Subject: [PATCH 21/32] Apply isort and black reformatting Signed-off-by: malay-nagda --- scripts/llm/performance/finetune_llama31_405b.py | 3 ++- scripts/llm/performance/finetune_llama3_70b.py | 3 ++- scripts/llm/performance/finetune_llama3_8b.py | 3 ++- scripts/llm/performance/pretrain_gpt3_175b.py | 3 ++- scripts/llm/performance/pretrain_llama31_405b.py | 3 ++- scripts/llm/performance/pretrain_llama3_70b.py | 3 ++- scripts/llm/performance/pretrain_llama3_8b.py | 3 ++- scripts/llm/performance/pretrain_mixtral_8x22b.py | 3 ++- scripts/llm/performance/pretrain_mixtral_8x7b.py | 3 ++- scripts/llm/performance/pretrain_nemotron3_22b.py | 3 ++- scripts/llm/performance/pretrain_nemotron3_8b.py | 3 ++- scripts/llm/performance/pretrain_nemotron4_15b.py | 3 ++- scripts/llm/performance/pretrain_nemotron4_340b.py | 3 ++- 13 files changed, 26 insertions(+), 13 deletions(-) diff --git a/scripts/llm/performance/finetune_llama31_405b.py b/scripts/llm/performance/finetune_llama31_405b.py index 8ec9f6974ac8..11c19c828db1 100644 --- a/scripts/llm/performance/finetune_llama31_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -95,7 +95,8 @@ def llama31_405b_performance_recipe( if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index c47bf3d36800..c5309531c8bb 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -95,7 +95,8 @@ def llama3_70b_performance_recipe( if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index 9a243227727c..db05521b7765 100644 --- a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -95,7 +95,8 @@ def llama3_8b_performance_recipe( if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_gpt3_175b.py b/scripts/llm/performance/pretrain_gpt3_175b.py index a61fc0acb758..f286e65b2921 100644 --- a/scripts/llm/performance/pretrain_gpt3_175b.py +++ b/scripts/llm/performance/pretrain_gpt3_175b.py @@ -92,7 +92,8 @@ def gpt3_175b_performance_recipe( dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_llama31_405b.py b/scripts/llm/performance/pretrain_llama31_405b.py index a9689d133120..8de63ad34ace 100644 --- a/scripts/llm/performance/pretrain_llama31_405b.py +++ b/scripts/llm/performance/pretrain_llama31_405b.py @@ -94,7 +94,8 @@ def llama3_405b_performance_recipe( dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_llama3_70b.py b/scripts/llm/performance/pretrain_llama3_70b.py index 4fbd0cd5d4d1..7bff24c1278f 100644 --- a/scripts/llm/performance/pretrain_llama3_70b.py +++ b/scripts/llm/performance/pretrain_llama3_70b.py @@ -94,7 +94,8 @@ def llama3_70b_performance_recipe( dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_llama3_8b.py b/scripts/llm/performance/pretrain_llama3_8b.py index bf86537a460c..524610b77aca 100644 --- a/scripts/llm/performance/pretrain_llama3_8b.py +++ b/scripts/llm/performance/pretrain_llama3_8b.py @@ -91,7 +91,8 @@ def llama3_8b_performance_recipe( dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_mixtral_8x22b.py b/scripts/llm/performance/pretrain_mixtral_8x22b.py index 53f9d6af42d0..4bb2a307b0b5 100644 --- a/scripts/llm/performance/pretrain_mixtral_8x22b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x22b.py @@ -94,7 +94,8 @@ def mixtral_8x22b_performance_recipe( dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_mixtral_8x7b.py b/scripts/llm/performance/pretrain_mixtral_8x7b.py index c22840312884..91c060234bce 100644 --- a/scripts/llm/performance/pretrain_mixtral_8x7b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x7b.py @@ -94,7 +94,8 @@ def mixtral_8x7b_performance_recipe( dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_nemotron3_22b.py b/scripts/llm/performance/pretrain_nemotron3_22b.py index 0b33e57f13aa..17d30b432a4c 100644 --- a/scripts/llm/performance/pretrain_nemotron3_22b.py +++ b/scripts/llm/performance/pretrain_nemotron3_22b.py @@ -98,7 +98,8 @@ def nemotron3_22b_performance_recipe( dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_nemotron3_8b.py b/scripts/llm/performance/pretrain_nemotron3_8b.py index 17e3dbcdf00e..868eda97d136 100644 --- a/scripts/llm/performance/pretrain_nemotron3_8b.py +++ b/scripts/llm/performance/pretrain_nemotron3_8b.py @@ -94,7 +94,8 @@ def nemotron3_8b_performance_recipe( dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_nemotron4_15b.py b/scripts/llm/performance/pretrain_nemotron4_15b.py index 1288c8613668..c2bf2131d81d 100644 --- a/scripts/llm/performance/pretrain_nemotron4_15b.py +++ b/scripts/llm/performance/pretrain_nemotron4_15b.py @@ -94,7 +94,8 @@ def nemotron4_15b_performance_recipe( dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None diff --git a/scripts/llm/performance/pretrain_nemotron4_340b.py b/scripts/llm/performance/pretrain_nemotron4_340b.py index e9ea61b65095..0fb7d20750af 100644 --- a/scripts/llm/performance/pretrain_nemotron4_340b.py +++ b/scripts/llm/performance/pretrain_nemotron4_340b.py @@ -98,7 +98,8 @@ def nemotron4_340b_performance_recipe( dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( - dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1) + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None From c8ffcc14586af11a3fe1956894dc9978cdd4b9e1 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Mon, 13 Jan 2025 15:36:53 +0530 Subject: [PATCH 22/32] tp and pp related cfgs Signed-off-by: Malay Nagda --- nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md | 1 + scripts/llm/performance/pretrain_nemotron3_8b.py | 2 +- scripts/llm/performance/pretrain_nemotron4_15b.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md index d7cd84a615c8..80eb0416cc18 100644 --- a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md +++ b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md @@ -84,6 +84,7 @@ ```sh tp_comm_overlap: bool = None # Enable tensor parallel overlap tp_comm_overlap_cfg: TransformerLayerTPOverlapCfg = None # Tensor parallel overlap config + tp_comm_bootstrap_backend: str = None # 'nccl' or 'mpi' for tp communication overlap_p2p_comm: bool = None # Enable pipeline parallel communication overlap batch_p2p_comm: bool = None # Batch pipeline parallel send and recv into a single op overlap_grad_reduce: bool = None # Overlap data parallel gradient reduction with compute diff --git a/scripts/llm/performance/pretrain_nemotron3_8b.py b/scripts/llm/performance/pretrain_nemotron3_8b.py index 868eda97d136..8563400bdf95 100644 --- a/scripts/llm/performance/pretrain_nemotron3_8b.py +++ b/scripts/llm/performance/pretrain_nemotron3_8b.py @@ -154,7 +154,7 @@ def nemotron3_8b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_nemotron4_15b.py b/scripts/llm/performance/pretrain_nemotron4_15b.py index c2bf2131d81d..89897561a766 100644 
--- a/scripts/llm/performance/pretrain_nemotron4_15b.py +++ b/scripts/llm/performance/pretrain_nemotron4_15b.py @@ -154,7 +154,7 @@ def nemotron4_15b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) From bb723660999ad4250b89e8b3b35edacda18c8a9a Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Mon, 13 Jan 2025 16:21:08 +0530 Subject: [PATCH 23/32] formatting Signed-off-by: Malay Nagda --- scripts/llm/performance/finetune_llama31_405b.py | 5 +---- scripts/llm/performance/finetune_llama3_70b.py | 5 +---- scripts/llm/performance/finetune_llama3_8b.py | 5 +---- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/scripts/llm/performance/finetune_llama31_405b.py b/scripts/llm/performance/finetune_llama31_405b.py index 11c19c828db1..42c5041b4932 100644 --- a/scripts/llm/performance/finetune_llama31_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -113,7 +113,6 @@ def llama31_405b_performance_recipe( exp_name = "_".join( [ basename(__file__), - f"llama31_405b", args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", @@ -130,9 +129,7 @@ def llama31_405b_performance_recipe( args.time_limit, args.container_image, custom_mounts=[], - custom_env_vars={ - "NVTE_FLASH_ATTN": "1", - }, + custom_env_vars={}, hf_token=args.hf_token, nemo_home=args.nemo_home, ) diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index c5309531c8bb..b7b7436b86b9 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -113,7 +113,6 @@ def llama3_70b_performance_recipe( exp_name = "_".join( [ basename(__file__), - f"llama3_70b", args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", @@ -130,9 +129,7 @@ def llama3_70b_performance_recipe( args.time_limit, args.container_image, custom_mounts=[], - custom_env_vars={ - "NVTE_FLASH_ATTN": "1", - }, + custom_env_vars={}, hf_token=args.hf_token, nemo_home=args.nemo_home, ) diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index db05521b7765..7aa02225ec02 100644 --- a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -113,7 +113,6 @@ def llama3_8b_performance_recipe( exp_name = "_".join( [ basename(__file__), - f"llama3_8b", args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", @@ -130,9 +129,7 @@ def llama3_8b_performance_recipe( args.time_limit, args.container_image, custom_mounts=[], - custom_env_vars={ - "NVTE_FLASH_ATTN": "1", - }, + custom_env_vars={}, hf_token=args.hf_token, nemo_home=args.nemo_home, ) From 6022e643fbb6f54cbb20c78b239e55ef48787472 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Mon, 13 Jan 2025 16:54:44 +0530 Subject: [PATCH 24/32] 340b fused attn Signed-off-by: Malay Nagda --- scripts/llm/performance/finetune_llama31_405b.py | 2 +- scripts/llm/performance/finetune_llama3_70b.py | 2 +- scripts/llm/performance/finetune_llama3_8b.py | 2 +- scripts/llm/performance/pretrain_nemotron4_340b.py | 4 +++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/llm/performance/finetune_llama31_405b.py 
b/scripts/llm/performance/finetune_llama31_405b.py index 42c5041b4932..18d701b14572 100644 --- a/scripts/llm/performance/finetune_llama31_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -112,7 +112,7 @@ def llama31_405b_performance_recipe( exp_name = "_".join( [ - basename(__file__), + args.finetuning.lower(), "llama31_405b", args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index b7b7436b86b9..40d998e06f09 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -112,7 +112,7 @@ def llama3_70b_performance_recipe( exp_name = "_".join( [ - basename(__file__), + args.finetuning.lower(), "llama3_70b", args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index 7aa02225ec02..6964219f1681 100644 --- a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -112,7 +112,7 @@ def llama3_8b_performance_recipe( exp_name = "_".join( [ - basename(__file__), + args.finetuning.lower(), "llama3_8b", args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", diff --git a/scripts/llm/performance/pretrain_nemotron4_340b.py b/scripts/llm/performance/pretrain_nemotron4_340b.py index 0fb7d20750af..d8e7e733bfaf 100644 --- a/scripts/llm/performance/pretrain_nemotron4_340b.py +++ b/scripts/llm/performance/pretrain_nemotron4_340b.py @@ -132,7 +132,9 @@ def nemotron4_340b_performance_recipe( args.time_limit, args.container_image, custom_mounts=[], - custom_env_vars={}, + custom_env_vars={ + "NVTE_FUSED_ATTN": "0", + }, hf_token=args.hf_token, nemo_home=args.nemo_home, ) From dc8819c23ab3af7a25fac7fa82d07787071a4106 Mon Sep 17 00:00:00 2001 From: malay-nagda Date: Mon, 13 Jan 2025 11:25:33 +0000 Subject: [PATCH 25/32] Apply isort and black reformatting Signed-off-by: malay-nagda --- scripts/llm/performance/finetune_llama31_405b.py | 3 ++- scripts/llm/performance/finetune_llama3_70b.py | 3 ++- scripts/llm/performance/finetune_llama3_8b.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/llm/performance/finetune_llama31_405b.py b/scripts/llm/performance/finetune_llama31_405b.py index 18d701b14572..812c303e7454 100644 --- a/scripts/llm/performance/finetune_llama31_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -112,7 +112,8 @@ def llama31_405b_performance_recipe( exp_name = "_".join( [ - args.finetuning.lower(), "llama31_405b", + args.finetuning.lower(), + "llama31_405b", args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index 40d998e06f09..ec16625ed0a9 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -112,7 +112,8 @@ def llama3_70b_performance_recipe( exp_name = "_".join( [ - args.finetuning.lower(), "llama3_70b", + args.finetuning.lower(), + "llama3_70b", args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index 6964219f1681..e252f970318f 100644 --- 
a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -112,7 +112,8 @@ def llama3_8b_performance_recipe( exp_name = "_".join( [ - args.finetuning.lower(), "llama3_8b", + args.finetuning.lower(), + "llama3_8b", args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", From dc1e8ee170b5ae2b09f1aa0ff31511dd3d626293 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Mon, 13 Jan 2025 18:18:13 +0530 Subject: [PATCH 26/32] conditional nccl_pp_comm_chunksize Signed-off-by: Malay Nagda --- scripts/llm/performance/finetune_llama31_405b.py | 3 +-- scripts/llm/performance/finetune_llama3_70b.py | 3 +-- scripts/llm/performance/finetune_llama3_8b.py | 3 +-- scripts/llm/performance/pretrain_gpt3_175b.py | 2 +- scripts/llm/performance/pretrain_llama31_405b.py | 2 +- scripts/llm/performance/pretrain_llama3_70b.py | 2 +- scripts/llm/performance/pretrain_llama3_8b.py | 2 +- scripts/llm/performance/pretrain_mixtral_8x22b.py | 2 +- scripts/llm/performance/pretrain_mixtral_8x7b.py | 2 +- scripts/llm/performance/pretrain_nemotron3_22b.py | 2 +- scripts/llm/performance/pretrain_nemotron3_8b.py | 2 +- scripts/llm/performance/pretrain_nemotron4_15b.py | 2 +- scripts/llm/performance/pretrain_nemotron4_340b.py | 2 +- 13 files changed, 13 insertions(+), 16 deletions(-) diff --git a/scripts/llm/performance/finetune_llama31_405b.py b/scripts/llm/performance/finetune_llama31_405b.py index 812c303e7454..b61a1574e141 100644 --- a/scripts/llm/performance/finetune_llama31_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from os.path import basename from typing import Optional import nemo_run as run @@ -157,7 +156,7 @@ def llama31_405b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index ec16625ed0a9..fe779f885449 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from os.path import basename from typing import Optional import nemo_run as run @@ -157,7 +156,7 @@ def llama3_70b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index e252f970318f..f68fe9acfb92 100644 --- a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from os.path import basename from typing import Optional import nemo_run as run @@ -157,7 +156,7 @@ def llama3_8b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_gpt3_175b.py b/scripts/llm/performance/pretrain_gpt3_175b.py index f286e65b2921..ded5899ca018 100644 --- a/scripts/llm/performance/pretrain_gpt3_175b.py +++ b/scripts/llm/performance/pretrain_gpt3_175b.py @@ -154,7 +154,7 @@ def gpt3_175b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_llama31_405b.py b/scripts/llm/performance/pretrain_llama31_405b.py index 8de63ad34ace..85227cb362d9 100644 --- a/scripts/llm/performance/pretrain_llama31_405b.py +++ b/scripts/llm/performance/pretrain_llama31_405b.py @@ -154,7 +154,7 @@ def llama3_405b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_llama3_70b.py b/scripts/llm/performance/pretrain_llama3_70b.py index 7bff24c1278f..3b96dda399a8 100644 --- a/scripts/llm/performance/pretrain_llama3_70b.py +++ b/scripts/llm/performance/pretrain_llama3_70b.py @@ -154,7 +154,7 @@ def llama3_70b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_llama3_8b.py b/scripts/llm/performance/pretrain_llama3_8b.py index 524610b77aca..55498753c431 100644 --- a/scripts/llm/performance/pretrain_llama3_8b.py +++ b/scripts/llm/performance/pretrain_llama3_8b.py @@ -151,7 +151,7 @@ def llama3_8b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_mixtral_8x22b.py b/scripts/llm/performance/pretrain_mixtral_8x22b.py index 4bb2a307b0b5..fd63aacf86be 100644 --- a/scripts/llm/performance/pretrain_mixtral_8x22b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x22b.py @@ -155,7 +155,7 @@ def mixtral_8x22b_performance_recipe( 
# following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_mixtral_8x7b.py b/scripts/llm/performance/pretrain_mixtral_8x7b.py index 91c060234bce..0a63eb78765d 100644 --- a/scripts/llm/performance/pretrain_mixtral_8x7b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x7b.py @@ -155,7 +155,7 @@ def mixtral_8x7b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_nemotron3_22b.py b/scripts/llm/performance/pretrain_nemotron3_22b.py index 17d30b432a4c..ce13678832a5 100644 --- a/scripts/llm/performance/pretrain_nemotron3_22b.py +++ b/scripts/llm/performance/pretrain_nemotron3_22b.py @@ -158,7 +158,7 @@ def nemotron3_22b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_nemotron3_8b.py b/scripts/llm/performance/pretrain_nemotron3_8b.py index 8563400bdf95..a40cf44d7af6 100644 --- a/scripts/llm/performance/pretrain_nemotron3_8b.py +++ b/scripts/llm/performance/pretrain_nemotron3_8b.py @@ -154,7 +154,7 @@ def nemotron3_8b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_nemotron4_15b.py b/scripts/llm/performance/pretrain_nemotron4_15b.py index 89897561a766..e83babb3473a 100644 --- a/scripts/llm/performance/pretrain_nemotron4_15b.py +++ b/scripts/llm/performance/pretrain_nemotron4_15b.py @@ -154,7 +154,7 @@ def nemotron4_15b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_nemotron4_340b.py b/scripts/llm/performance/pretrain_nemotron4_340b.py index d8e7e733bfaf..f341e4ea2f1d 100644 --- a/scripts/llm/performance/pretrain_nemotron4_340b.py +++ b/scripts/llm/performance/pretrain_nemotron4_340b.py @@ -160,7 +160,7 @@ def nemotron4_340b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` 
recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) From 10145fe392f993bd0f8ecea7c84d584ece2048c8 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Mon, 13 Jan 2025 22:08:02 +0530 Subject: [PATCH 27/32] null tokenizer Signed-off-by: Malay Nagda --- scripts/llm/performance/pretrain_nemotron3_8b.py | 2 +- scripts/llm/performance/pretrain_nemotron4_15b.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/llm/performance/pretrain_nemotron3_8b.py b/scripts/llm/performance/pretrain_nemotron3_8b.py index a40cf44d7af6..8ac3690b14b5 100644 --- a/scripts/llm/performance/pretrain_nemotron3_8b.py +++ b/scripts/llm/performance/pretrain_nemotron3_8b.py @@ -59,7 +59,7 @@ def nemotron3_8b_performance_recipe( recipe.data.global_batch_size = gbs recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run recipe.data.tokenizer = run.Config( - get_nmt_tokenizer, library="null_lib", model_name="NullTokenizer", vocab_size=256000 + get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000 ) recipe.trainer.max_steps = max_steps diff --git a/scripts/llm/performance/pretrain_nemotron4_15b.py b/scripts/llm/performance/pretrain_nemotron4_15b.py index e83babb3473a..bfd3befb4c0b 100644 --- a/scripts/llm/performance/pretrain_nemotron4_15b.py +++ b/scripts/llm/performance/pretrain_nemotron4_15b.py @@ -59,7 +59,7 @@ def nemotron4_15b_performance_recipe( recipe.data.global_batch_size = gbs recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run recipe.data.tokenizer = run.Config( - get_nmt_tokenizer, library="null_lib", model_name="NullTokenizer", vocab_size=256000 + get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000 ) recipe.trainer.max_steps = max_steps From 467ba30ee816f48246508deef1a71120224bf4ae Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Wed, 15 Jan 2025 18:22:38 +0530 Subject: [PATCH 28/32] logs msgs Signed-off-by: Malay Nagda --- scripts/llm/performance/utils.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index b892ec4468eb..9d640be3b80d 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -52,8 +52,6 @@ def slurm_executor( err_msgs = [] if log_dir != NEMORUN_HOME: err_msgs.append(f"\nRun `export NEMORUN_HOME={log_dir}` in your shell environment and rerun this script.") - if nemo_home != DEFAULT_NEMO_HOME: - err_msgs.append(f"Run `export NEMO_HOME={nemo_home}` in your shell environment and rerun this script.") if len(err_msgs) > 0: logging.error("\n".join(err_msgs)) sys.exit(1) @@ -113,11 +111,11 @@ def hf_tokenizer(model_name: str) -> run.Config[AutoTokenizer]: huggingface.co/docs/transformers/v4.47.1/en/model_doc/auto#transformers.AutoTokenizer """ log_msg = [ - f"AutoTokenizer first searches for tokenizer files locally in env var {DEFAULT_NEMO_HOME}.", - "If files are missing locally, AutoTokenizer will try downloading from HuggingFace.", - "Make sure 'TRANSFORMERS_OFFLINE=0' and 'HF_TOKEN:'.", - "You can set them as scripts.llm.performance.utils.slurm_executor(custom_env_vars=", - "{'TRANSFORMERS_OFFLINE: 0', 'HF_TOKEN: '}", + f"`AutoTokenizer` first searches for tokenizer 
files locally stored in {DEFAULT_NEMO_HOME}.", + "(from env var `NEMO_HOME`- can be changed using '-nh/--nemo_home' CLI arg).", + "If files are missing locally, `AutoTokenizer` will try downloading from HuggingFace. In this case-", + "make sure env vars 'TRANSFORMERS_OFFLINE':'0' and 'HF_TOKEN':'' are set in your sbatch script.", + "Both of these will be set automatically if you provide '-hf/--hf_token' CLI arg.", ] logging.warning(" ".join(log_msg)) From c0d291ba597ec82991ef99598b24980a7a612631 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Wed, 15 Jan 2025 18:35:59 +0530 Subject: [PATCH 29/32] logs msgs Signed-off-by: Malay Nagda --- scripts/llm/performance/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index 9d640be3b80d..68f4883451b2 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -257,9 +257,9 @@ def parse_cli_args(): default=None, ) nemo_home_msg = [ - "Directory where NeMo searches for models and checkpoints.", - "This saves a lot of time (especially for bigger models) if checkpoints already exist here.", - "Missing files will be downloaded from HuggingFace., " f"Defaults to {DEFAULT_NEMO_HOME}", + "Sets env var `NEMO_HOME` (on compute node using sbatch script)- directory where NeMo searches", + "for models and checkpoints. This saves a lot of time (especially for bigger models) if checkpoints already", + f"exist here. Missing files will be downloaded here from HuggingFace. Defaults to {DEFAULT_NEMO_HOME}", ] parser.add_argument( "-nh", From a91071cfbbb9f9cd572721a0e26a709ef9314136 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Wed, 15 Jan 2025 22:38:46 +0530 Subject: [PATCH 30/32] temp mem mesaurement Signed-off-by: Malay Nagda --- nemo/lightning/pytorch/strategies/megatron_strategy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index d38753bd7935..a2f36f989d4f 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -563,7 +563,7 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP if self.log_memory_usage: max_memory_reserved = torch.cuda.max_memory_reserved() - memory_allocated = torch.cuda.memory_allocated() + _memory_allocated = (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024 / 1024 / 1024 self.lightning_module.log( "peak_memory_usage", max_memory_reserved, @@ -571,8 +571,8 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP batch_size=1, ) self.lightning_module.log( - "memory_allocated", - memory_allocated, + "_memory_allocated", + _memory_allocated, prog_bar=True, batch_size=1, ) From c1a25db23cfed64c15d3a0ac6c37dc424a7db994 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Sat, 18 Jan 2025 21:43:50 +0530 Subject: [PATCH 31/32] mem usage, 8b tp4 Signed-off-by: Malay Nagda --- nemo/lightning/pytorch/strategies/megatron_strategy.py | 6 +++--- scripts/llm/performance/pretrain_nemotron4_15b.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py index a2f36f989d4f..d38753bd7935 100644 --- a/nemo/lightning/pytorch/strategies/megatron_strategy.py +++ b/nemo/lightning/pytorch/strategies/megatron_strategy.py @@ -563,7 
+563,7 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP if self.log_memory_usage: max_memory_reserved = torch.cuda.max_memory_reserved() - _memory_allocated = (torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]) / 1024 / 1024 / 1024 + memory_allocated = torch.cuda.memory_allocated() self.lightning_module.log( "peak_memory_usage", max_memory_reserved, @@ -571,8 +571,8 @@ def training_step(self, dataloader_iter, *args: Any, **kwargs: Any) -> STEP_OUTP batch_size=1, ) self.lightning_module.log( - "_memory_allocated", - _memory_allocated, + "memory_allocated", + memory_allocated, prog_bar=True, batch_size=1, ) diff --git a/scripts/llm/performance/pretrain_nemotron4_15b.py b/scripts/llm/performance/pretrain_nemotron4_15b.py index bfd3befb4c0b..05cbe78692af 100644 --- a/scripts/llm/performance/pretrain_nemotron4_15b.py +++ b/scripts/llm/performance/pretrain_nemotron4_15b.py @@ -28,7 +28,7 @@ NUM_GPUS_PER_NODE = 8 MICRO_BATCH_SIZE = 4 GLOBAL_BATCH_SIZE = 256 -TP_SIZE = 2 +TP_SIZE = 4 PP_SIZE = 1 CP_SIZE = 1 VP_SIZE = None From c80f80ac9e0e3a3c0a2d1a8621993c056f0fc6c9 Mon Sep 17 00:00:00 2001 From: Malay Nagda Date: Wed, 22 Jan 2025 22:44:18 +0530 Subject: [PATCH 32/32] nccl backend Signed-off-by: Malay Nagda --- scripts/llm/performance/finetune_llama31_405b.py | 2 +- scripts/llm/performance/finetune_llama3_70b.py | 2 +- scripts/llm/performance/finetune_llama3_8b.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/llm/performance/finetune_llama31_405b.py b/scripts/llm/performance/finetune_llama31_405b.py index b61a1574e141..52d28301b710 100644 --- a/scripts/llm/performance/finetune_llama31_405b.py +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -92,7 +92,7 @@ def llama31_405b_performance_recipe( # callback configs dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: - recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "nccl" recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 ) diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py index fe779f885449..1645c4845120 100644 --- a/scripts/llm/performance/finetune_llama3_70b.py +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -92,7 +92,7 @@ def llama3_70b_performance_recipe( # callback configs dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: - recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "nccl" recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 ) diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py index f68fe9acfb92..65e0c0db3f91 100644 --- a/scripts/llm/performance/finetune_llama3_8b.py +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -92,7 +92,7 @@ def llama3_8b_performance_recipe( # callback configs dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) if comm_overlap_callback_idx is not None: - 
recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "mpi" + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "nccl" recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 )
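Taken together, the later patches in this series converge on a small set of recurring settings for the scripts/llm/performance/*.py recipes: `PerfEnvPlugin` only receives `nccl_pp_comm_chunksize` when pipeline parallelism is actually enabled (PATCH 26), the TP communication-overlap callback bootstraps over NCCL instead of MPI (PATCH 32), and the Nemotron pretrain recipes build their tokenizer with `library="null"` (PATCH 27). The sketch below is illustrative only, not an additional patch: it assumes `recipe`, `args`, `PP_SIZE`, and `comm_overlap_callback_idx` are defined as they are in the individual scripts, and the `get_nmt_tokenizer` import path is an assumption about where NeMo exposes that helper.

import nemo_run as run

from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin

# Import path assumed for illustration; the pretrain scripts import get_nmt_tokenizer themselves.
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer


def apply_common_perf_settings(recipe, args, PP_SIZE, comm_overlap_callback_idx, use_null_tokenizer=False):
    """Illustrative consolidation of the settings introduced by PATCH 26, 27 and 32."""
    # Only pass a pipeline-parallel NCCL chunk size when PP is enabled (PATCH 26);
    # with PP_SIZE == 1 the argument stays None.
    plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)]
    if args.enable_profiling:
        plugins.append(NsysPlugin(start_step=5, end_step=6))

    # Bootstrap TP communication overlap over NCCL instead of MPI (PATCH 32).
    if comm_overlap_callback_idx is not None:
        recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "nccl"

    # Nemotron pretrain recipes use the null tokenizer with library="null" (PATCH 27).
    if use_null_tokenizer:
        recipe.data.tokenizer = run.Config(
            get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000
        )

    return recipe, plugins

As the final diffs above show, the conditional chunk size keeps single-node, PP=1 fine-tuning runs from carrying a pipeline-parallel NCCL setting they never use, and switching the bootstrap backend from "mpi" to "nccl" avoids relying on an MPI launcher for that bootstrap step.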