diff --git a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md index d7cd84a615c8..80eb0416cc18 100644 --- a/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md +++ b/nemo/collections/llm/recipes/CONFIGURATION-HIERARCHY.md @@ -84,6 +84,7 @@ ```sh tp_comm_overlap: bool = None # Enable tensor parallel overlap tp_comm_overlap_cfg: TransformerLayerTPOverlapCfg = None # Tensor parallel overlap config + tp_comm_bootstrap_backend: str = None # 'nccl' or 'mpi' for tp communication overlap_p2p_comm: bool = None # Enable pipeline parallel communication overlap batch_p2p_comm: bool = None # Batch pipeline parallel send and recv into a single op overlap_grad_reduce: bool = None # Overlap data parallel gradient reduction with compute diff --git a/nemo/collections/nlp/modules/common/tokenizer_utils.py b/nemo/collections/nlp/modules/common/tokenizer_utils.py index 21d04887ea6b..6d11d9c42157 100644 --- a/nemo/collections/nlp/modules/common/tokenizer_utils.py +++ b/nemo/collections/nlp/modules/common/tokenizer_utils.py @@ -254,6 +254,6 @@ def get_nmt_tokenizer( return NullTokenizer(vocab_size) else: raise NotImplementedError( - 'Currently we only support "huggingface", "sentencepiece", "megatron", and "byte-level" tokenizer' - 'libraries.' + 'Currently we only support "huggingface", "sentencepiece", "megatron", "byte-level", "regex", "tabular", ' + '"tiktoken", and "null" tokenizer libraries.' ) diff --git a/scripts/llm/performance/README.md b/scripts/llm/performance/README.md index 62bf58329633..99bf92ed179b 100644 --- a/scripts/llm/performance/README.md +++ b/scripts/llm/performance/README.md @@ -13,15 +13,18 @@ The following line shows an example of how you can launch a pre-training experim - Slurm account and partition are mandatory arguments for launching the experiment. - You can use the following optional arguments as needed- - - -l/--log_dir: Location to store your experiment artifacts and logs. - - Make sure the environemnt variable `NEMORUN_HOME=` is accessible and set correctly in your virtual environment. + - -l/--log_dir: Location to store your experiment artifacts and logs. + - Make sure the environment variable `NEMORUN_HOME=` is accessible and set correctly in your virtual environment. - You can run `export NEMORUN_HOME=` in your terminal. You can add it your bashrc file (or equivalent for your OS/Linux distro) for setting it permanently. - -t/--time_limit: Maximum time limit for your experiment. Your slurm job will be cancelled after this. Default is 30 minutes. - -i/--container_image: The NeMo container you want to use. Defaults to latest dev container- 'nvcr.io/nvidia/nemo:dev'. - -c/--compute_dtype: Specifies whether you want to use bf16 or fp8 precision for training. Defaults to 'bf16'. You can choose to use 'fp8'. - - -ep/--enable_profiling: Enable nsys profiling. It is disabled by default. When enabled, profiling will be enabled for 1 step from step 5 to step 6. You can change the step in the respective recipe script. - - -tb/--tensorboard: Enable tensorboard logging. It is disabled by default. - - CAUTION: Tensorboard logging may cause performance overhead.
+ - -f/--finetuning: Finetuning scheme to use. Options- 'sft', 'lora'. Defaults to 'lora'. + - -hf/--hf_token: HuggingFace access token. Defaults to None. Required for accessing tokenizers and checkpoints from HuggingFace. + - -nh/--nemo_home: Directory where NeMo searches for models and checkpoints. This saves a lot of time (especially for bigger models) if checkpoints already exist here. Missing files will be downloaded from HuggingFace. Defaults to environment variable DEFAULT_NEMO_CACHE_HOME = ~/.cache/nemo - -d/--dryrun: Using this argument will not launch the experiment. It will simply print the sbatch script to stdout. This can be helpful to verify you have set your experiment correctly as needed. - You don't need to set any value for `--enable_profiling`, `--tensorboard` and `--dryrun`. See the below example for reference- `python3 scripts/llm/performance/llama3_8b.py --account -p -ep --tensorboard -d` diff --git a/scripts/llm/performance/finetune_llama31_405b.py b/scripts/llm/performance/finetune_llama31_405b.py new file mode 100644 index 000000000000..52d28301b710 --- /dev/null +++ b/scripts/llm/performance/finetune_llama31_405b.py @@ -0,0 +1,175 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import nemo_run as run +from utils import ( + get_comm_overlap_callback_idx, + hf_tokenizer, + import_ckpt_experiment, + isfile_train_pack_metadata, + parse_cli_args, + slurm_executor, +) + +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.recipes.llama31_405b import finetune_recipe, model +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin + +NUM_NODES = 3 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 1 +GLOBAL_BATCH_SIZE = 24 +TP_SIZE = 4 +PP_SIZE = 6 +CP_SIZE = 1 +VP_SIZE = 7 +MAX_STEPS = 100 + +HF_MODEL_URI = "meta-llama/Llama-3.1-405B" + + +def llama31_405b_performance_recipe( + finetuning_scheme: str, + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + llama3.1 405b fine-tuning recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results.
+ """ + finetuning_scheme = "none" if finetuning_scheme == "sft" else finetuning_scheme + recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI) + if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data): + # flag is valid only for SquadDataModule + recipe.data.force_redownload = True + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "nccl" + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) + + # Misc. for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + args.finetuning.lower(), + "llama31_405b", + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={}, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = llama31_405b_performance_recipe( + args.finetuning.lower(), + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. 
+ recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}")) + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/finetune_llama3_70b.py b/scripts/llm/performance/finetune_llama3_70b.py new file mode 100644 index 000000000000..1645c4845120 --- /dev/null +++ b/scripts/llm/performance/finetune_llama3_70b.py @@ -0,0 +1,175 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import nemo_run as run +from utils import ( + get_comm_overlap_callback_idx, + hf_tokenizer, + import_ckpt_experiment, + isfile_train_pack_metadata, + parse_cli_args, + slurm_executor, +) + +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.recipes.llama3_70b import finetune_recipe, model +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin + +NUM_NODES = 1 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 1 +GLOBAL_BATCH_SIZE = 32 +TP_SIZE = 2 +PP_SIZE = 4 +CP_SIZE = 1 +VP_SIZE = 20 +MAX_STEPS = 100 + +HF_MODEL_URI = "meta-llama/Meta-Llama-3-70B" + + +def llama3_70b_performance_recipe( + finetuning_scheme: str, + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + llama3 70b fine-tuning recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results.
+ """ + finetuning_scheme = "none" if finetuning_scheme == "sft" else finetuning_scheme + recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI) + if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data): + # flag is valid only for SquadDataModule + recipe.data.force_redownload = True + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "nccl" + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) + + # Misc. for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + args.finetuning.lower(), + "llama3_70b", + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={}, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = llama3_70b_performance_recipe( + args.finetuning.lower(), + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. 
+ recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}")) + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/finetune_llama3_8b.py b/scripts/llm/performance/finetune_llama3_8b.py new file mode 100644 index 000000000000..65e0c0db3f91 --- /dev/null +++ b/scripts/llm/performance/finetune_llama3_8b.py @@ -0,0 +1,175 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +import nemo_run as run +from utils import ( + get_comm_overlap_callback_idx, + hf_tokenizer, + import_ckpt_experiment, + isfile_train_pack_metadata, + parse_cli_args, + slurm_executor, +) + +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin + +NUM_NODES = 1 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 1 +GLOBAL_BATCH_SIZE = 32 +TP_SIZE = 1 +PP_SIZE = 1 +CP_SIZE = 1 +VP_SIZE = None +MAX_STEPS = 100 + +HF_MODEL_URI = "meta-llama/Meta-Llama-3-8B" + + +def llama3_8b_performance_recipe( + finetuning_scheme: str, + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + llama3 8b fine-tuning recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results.
+ """ + finetuning_scheme = "none" if finetuning_scheme == "sft" else finetuning_scheme + recipe = finetune_recipe(peft_scheme=finetuning_scheme, performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.tokenizer = hf_tokenizer(HF_MODEL_URI) + if recipe.data.__fn_or_cls__ == SquadDataModule and not isfile_train_pack_metadata(HF_MODEL_URI, recipe.data): + # flag is valid only for SquadDataModule + recipe.data.force_redownload = True + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].tp_comm_bootstrap_backend = "nccl" + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) + + # Misc. for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + args.finetuning.lower(), + "llama3_8b", + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={}, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = llama3_8b_performance_recipe( + args.finetuning.lower(), + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. 
+ recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add(*import_ckpt_experiment(executor, model(), source=f"hf://{HF_MODEL_URI}")) + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/gpt3_175b.py b/scripts/llm/performance/pretrain_gpt3_175b.py similarity index 86% rename from scripts/llm/performance/gpt3_175b.py rename to scripts/llm/performance/pretrain_gpt3_175b.py index 01a3f7381628..ded5899ca018 100644 --- a/scripts/llm/performance/gpt3_175b.py +++ b/scripts/llm/performance/pretrain_gpt3_175b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,17 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +from os.path import basename from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.gpt3_175b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 64 NUM_GPUS_PER_NODE = 8 @@ -69,10 +68,7 @@ def gpt3_175b_performance_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = pp_size recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -86,7 +82,7 @@ def gpt3_175b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -94,9 +90,10 @@ def gpt3_175b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None @@ -111,15 +108,10 @@ def gpt3_175b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ - f"gpt3_175b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", @@ -137,7 +129,8 @@ def gpt3_175b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = gpt3_175b_performance_recipe( @@ -161,7 +154,7 @@ def gpt3_175b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/llama3_405b.py b/scripts/llm/performance/pretrain_llama31_405b.py similarity index 86% rename from scripts/llm/performance/llama3_405b.py rename to scripts/llm/performance/pretrain_llama31_405b.py index dd6194c7f8b3..85227cb362d9 100644 --- a/scripts/llm/performance/llama3_405b.py +++ b/scripts/llm/performance/pretrain_llama31_405b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,17 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from os.path import basename from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.llama31_405b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 72 NUM_GPUS_PER_NODE = 8 @@ -69,10 +68,7 @@ def llama3_405b_performance_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = pp_size recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -88,7 +84,7 @@ def llama3_405b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -96,9 +92,10 @@ def llama3_405b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None @@ -111,15 +108,10 @@ def llama3_405b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ - f"llama3_405b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", @@ -137,7 +129,8 @@ def llama3_405b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = llama3_405b_performance_recipe( @@ -161,7 +154,7 @@ def llama3_405b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/llama3_70b.py b/scripts/llm/performance/pretrain_llama3_70b.py similarity index 86% rename from scripts/llm/performance/llama3_70b.py rename to scripts/llm/performance/pretrain_llama3_70b.py index 97babadbe803..3b96dda399a8 100644 --- a/scripts/llm/performance/llama3_70b.py +++ b/scripts/llm/performance/pretrain_llama3_70b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,17 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +from os.path import basename from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.llama3_70b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 8 NUM_GPUS_PER_NODE = 8 @@ -69,10 +68,7 @@ def llama3_70b_performance_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = pp_size recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -88,7 +84,7 @@ def llama3_70b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -96,9 +92,10 @@ def llama3_70b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None @@ -111,15 +108,10 @@ def llama3_70b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ - f"llama3_70b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", @@ -137,7 +129,8 @@ def llama3_70b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = llama3_70b_performance_recipe( @@ -161,7 +154,7 @@ def llama3_70b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/llama3_8b.py b/scripts/llm/performance/pretrain_llama3_8b.py similarity index 85% rename from scripts/llm/performance/llama3_8b.py rename to scripts/llm/performance/pretrain_llama3_8b.py index 81382cc33a16..55498753c431 100644 --- a/scripts/llm/performance/llama3_8b.py +++ b/scripts/llm/performance/pretrain_llama3_8b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,17 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from os.path import basename from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.llama3_8b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 1 NUM_GPUS_PER_NODE = 8 @@ -69,10 +68,7 @@ def llama3_8b_performance_recipe( recipe.trainer.strategy.pipeline_model_parallel_size = pp_size recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -85,7 +81,7 @@ def llama3_8b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -93,9 +89,10 @@ def llama3_8b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None @@ -108,15 +105,10 @@ def llama3_8b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ - f"llama3_8b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", @@ -134,7 +126,8 @@ def llama3_8b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = llama3_8b_performance_recipe( @@ -158,7 +151,7 @@ def llama3_8b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/mixtral_8x22b.py b/scripts/llm/performance/pretrain_mixtral_8x22b.py similarity index 83% rename from scripts/llm/performance/mixtral_8x22b.py rename to scripts/llm/performance/pretrain_mixtral_8x22b.py index b474561296e4..fd63aacf86be 100644 --- a/scripts/llm/performance/mixtral_8x22b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x22b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,17 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. +from os.path import basename from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor -from nemo.collections.llm.recipes.mixtral_8x7b import pretrain_recipe +from nemo.collections.llm.recipes.mixtral_8x22b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 128 NUM_GPUS_PER_NODE = 8 @@ -50,7 +49,7 @@ def mixtral_8x22b_performance_recipe( max_steps: int, ): """ - mixtral 8x7b pre-train recipe aimed at achieving best possible performance. + mixtral 8x22b pre-train recipe aimed at achieving best possible performance. NOTE: Use fp8 precision training with caution. It might not give desirable results. """ @@ -72,10 +71,7 @@ def mixtral_8x22b_performance_recipe( recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size recipe.trainer.strategy.expert_model_parallel_size = ep_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -88,7 +84,7 @@ def mixtral_8x22b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -96,9 +92,10 @@ def mixtral_8x22b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. 
for overall faster experiment runtime recipe.log.ckpt = None @@ -111,15 +108,10 @@ def mixtral_8x22b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ - f"mixtral_8x22b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", @@ -137,7 +129,8 @@ def mixtral_8x22b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = mixtral_8x22b_performance_recipe( @@ -162,7 +155,7 @@ def mixtral_8x22b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/mixtral_8x7b.py b/scripts/llm/performance/pretrain_mixtral_8x7b.py similarity index 86% rename from scripts/llm/performance/mixtral_8x7b.py rename to scripts/llm/performance/pretrain_mixtral_8x7b.py index 4d5321269227..0a63eb78765d 100644 --- a/scripts/llm/performance/mixtral_8x7b.py +++ b/scripts/llm/performance/pretrain_mixtral_8x7b.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,17 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from os.path import basename from typing import Optional import nemo_run as run -from nemo_run.config import NEMORUN_HOME from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor from nemo.collections.llm.recipes.mixtral_8x7b import pretrain_recipe from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin -from nemo.utils import logging NUM_NODES = 8 NUM_GPUS_PER_NODE = 8 @@ -72,10 +71,7 @@ def mixtral_8x7b_performance_recipe( recipe.trainer.strategy.context_parallel_size = cp_size recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size recipe.trainer.strategy.expert_model_parallel_size = ep_size - if tp_size > 1: - recipe.trainer.strategy.sequence_parallel = True - else: - recipe.trainer.strategy.sequence_parallel = False + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) @@ -88,7 +84,7 @@ def mixtral_8x7b_performance_recipe( garbage_collection_callback = run.Config( GarbageCollectionCallback, gc_interval_train=100, - gc_interval_val=500, + gc_interval_val=100, ) recipe.trainer.callbacks.extend( [ @@ -96,9 +92,10 @@ def mixtral_8x7b_performance_recipe( ] ) dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) - if dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1: - if comm_overlap_callback_idx >= 0: - recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = True + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) # Misc. for overall faster experiment runtime recipe.log.ckpt = None @@ -111,15 +108,10 @@ def mixtral_8x7b_performance_recipe( if __name__ == "__main__": args = parse_cli_args().parse_args() - if args.log_dir != NEMORUN_HOME: - import sys - - logging.error(f"Run `export NEMORUN_HOME={args.log_dir}` in your shell environment and rerun this script.") - sys.exit(1) exp_name = "_".join( [ - f"mixtral_8x7b", + basename(__file__), args.compute_dtype, f"{NUM_NODES}nodes", f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", @@ -137,7 +129,8 @@ def mixtral_8x7b_performance_recipe( args.container_image, custom_mounts=[], custom_env_vars={}, - retries=0, + hf_token=args.hf_token, + nemo_home=args.nemo_home, ) recipe = mixtral_8x7b_performance_recipe( @@ -162,7 +155,7 @@ def mixtral_8x7b_performance_recipe( # following line ensures file is at- `/lightning_logs/tb_logs/default/` recipe.log.log_dir = "/nemo_run/lightning_logs" - plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152)] + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] if args.enable_profiling: plugins.append(NsysPlugin(start_step=5, end_step=6)) diff --git a/scripts/llm/performance/pretrain_nemotron3_22b.py b/scripts/llm/performance/pretrain_nemotron3_22b.py new file mode 100644 index 000000000000..ce13678832a5 --- /dev/null +++ b/scripts/llm/performance/pretrain_nemotron3_22b.py @@ -0,0 +1,176 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from os.path import basename +from typing import Optional + +import nemo_run as run +from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor + +from nemo.collections.llm.recipes.nemotron3_22b import pretrain_recipe +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin + +NUM_NODES = 2 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 1 +GLOBAL_BATCH_SIZE = 32 +TP_SIZE = 2 +PP_SIZE = 4 +CP_SIZE = 1 +VP_SIZE = 10 +MAX_STEPS = 100 + + +def nemotron3_22b_performance_recipe( + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + nemotron3 22b pre-train recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results. + """ + recipe = pretrain_recipe(performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run + if compute_dtype == "bf16": + recipe.data.tokenizer = run.Config( + get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000 + ) + recipe.model.tokenizer = recipe.data.tokenizer + else: + recipe.data.tokenizer = hf_tokenizer("nvidia/megatron-gpt2-345m") + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + garbage_collection_callback = run.Config( + GarbageCollectionCallback, + gc_interval_train=100, + gc_interval_val=100, + ) + recipe.trainer.callbacks.extend( + [ + garbage_collection_callback, + ] + ) + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) + + # Misc. 
for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + basename(__file__), + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={}, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = nemotron3_22b_performance_recipe( + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. + recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/pretrain_nemotron3_8b.py b/scripts/llm/performance/pretrain_nemotron3_8b.py new file mode 100644 index 000000000000..8ac3690b14b5 --- /dev/null +++ b/scripts/llm/performance/pretrain_nemotron3_8b.py @@ -0,0 +1,172 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from os.path import basename +from typing import Optional + +import nemo_run as run +from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor + +from nemo.collections.llm.recipes.nemotron3_8b import pretrain_recipe +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin + +NUM_NODES = 1 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 2 +GLOBAL_BATCH_SIZE = 32 +TP_SIZE = 2 +PP_SIZE = 1 +CP_SIZE = 1 +VP_SIZE = None +MAX_STEPS = 100 + + +def nemotron3_8b_performance_recipe( + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + nemotron3 8b pre-train recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results. + """ + recipe = pretrain_recipe(performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run + recipe.data.tokenizer = run.Config( + get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000 + ) + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + garbage_collection_callback = run.Config( + GarbageCollectionCallback, + gc_interval_train=100, + gc_interval_val=100, + ) + recipe.trainer.callbacks.extend( + [ + garbage_collection_callback, + ] + ) + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) + + # Misc. 
for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + basename(__file__), + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={}, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = nemotron3_8b_performance_recipe( + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. + recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/pretrain_nemotron4_15b.py b/scripts/llm/performance/pretrain_nemotron4_15b.py new file mode 100644 index 000000000000..05cbe78692af --- /dev/null +++ b/scripts/llm/performance/pretrain_nemotron4_15b.py @@ -0,0 +1,172 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from os.path import basename +from typing import Optional + +import nemo_run as run +from utils import get_comm_overlap_callback_idx, parse_cli_args, slurm_executor + +from nemo.collections.llm.recipes.nemotron4_15b import pretrain_recipe +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin + +NUM_NODES = 8 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 4 +GLOBAL_BATCH_SIZE = 256 +TP_SIZE = 4 +PP_SIZE = 1 +CP_SIZE = 1 +VP_SIZE = None +MAX_STEPS = 100 + + +def nemotron4_15b_performance_recipe( + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + nemotron4 15b pre-train recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results. + """ + recipe = pretrain_recipe(performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run + recipe.data.tokenizer = run.Config( + get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000 + ) + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + garbage_collection_callback = run.Config( + GarbageCollectionCallback, + gc_interval_train=100, + gc_interval_val=100, + ) + recipe.trainer.callbacks.extend( + [ + garbage_collection_callback, + ] + ) + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) + + # Misc. 
for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + basename(__file__), + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={}, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = nemotron4_15b_performance_recipe( + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. + recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/pretrain_nemotron4_340b.py b/scripts/llm/performance/pretrain_nemotron4_340b.py new file mode 100644 index 000000000000..f341e4ea2f1d --- /dev/null +++ b/scripts/llm/performance/pretrain_nemotron4_340b.py @@ -0,0 +1,178 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
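+
+# Note on the default configuration below (a rough sanity check, assuming the standard
+# Megatron decomposition world_size = TP * PP * CP * DP):
+#   world_size     = NUM_NODES * NUM_GPUS_PER_NODE = 16 * 8 = 128
+#   model-parallel = TP * PP * CP = 8 * 8 * 2 = 128, so DP = 128 / 128 = 1
+# With DP = 1, `overlap_param_gather_with_optimizer_step` in the recipe below resolves to
+# False, and each step runs GLOBAL_BATCH_SIZE / MICRO_BATCH_SIZE = 32 micro-batches
+# through the pipeline.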
+ +from os.path import basename +from typing import Optional + +import nemo_run as run +from utils import get_comm_overlap_callback_idx, hf_tokenizer, parse_cli_args, slurm_executor + +from nemo.collections.llm.recipes.nemotron4_340b import pretrain_recipe +from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed +from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer +from nemo.lightning.pytorch.callbacks.garbage_collection import GarbageCollectionCallback +from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin + +NUM_NODES = 16 +NUM_GPUS_PER_NODE = 8 +MICRO_BATCH_SIZE = 1 +GLOBAL_BATCH_SIZE = 32 +TP_SIZE = 8 +PP_SIZE = 8 +CP_SIZE = 2 +VP_SIZE = 12 +MAX_STEPS = 100 + + +def nemotron4_340b_performance_recipe( + compute_dtype: str, + num_nodes: int, + num_gpus_per_node: int, + mbs: int, + gbs: int, + tp_size: int, + pp_size: int, + cp_size: int, + vp_size: Optional[int], + max_steps: int, +): + """ + nemotron4 340b pre-train recipe aimed at achieving best possible performance. + + NOTE: Use fp8 precision training with caution. It might not give desirable results. + """ + recipe = pretrain_recipe(performance_mode=True) + + # data module configs + recipe.data.micro_batch_size = mbs + recipe.data.global_batch_size = gbs + recipe.data.num_train_samples = max_steps * gbs * mbs # ensure only 1 epoch for whole run + if compute_dtype == "bf16": + recipe.data.tokenizer = run.Config( + get_nmt_tokenizer, library="null", model_name="NullTokenizer", vocab_size=256000 + ) + recipe.model.tokenizer = recipe.data.tokenizer + else: + recipe.data.tokenizer = hf_tokenizer("nvidia/megatron-gpt2-345m") + + recipe.trainer.max_steps = max_steps + recipe.trainer.num_nodes = num_nodes + recipe.trainer.devices = num_gpus_per_node + + # parallelism configs + recipe.trainer.strategy.tensor_model_parallel_size = tp_size + recipe.trainer.strategy.pipeline_model_parallel_size = pp_size + recipe.trainer.strategy.context_parallel_size = cp_size + recipe.trainer.strategy.virtual_pipeline_model_parallel_size = vp_size + recipe.trainer.strategy.sequence_parallel = bool(tp_size > 1) + + comm_overlap_callback_idx = get_comm_overlap_callback_idx(recipe.trainer.callbacks) + + # compute dtype configs + if compute_dtype.lower() == "fp8": + recipe.trainer.plugins = bf16_with_fp8_mixed() + recipe.trainer.plugins.grad_reduce_in_fp32 = False # bf16 grad dtype + + # callback configs + garbage_collection_callback = run.Config( + GarbageCollectionCallback, + gc_interval_train=100, + gc_interval_val=100, + ) + recipe.trainer.callbacks.extend( + [ + garbage_collection_callback, + ] + ) + dp_size = (num_nodes * num_gpus_per_node) / (tp_size * pp_size * cp_size) + if comm_overlap_callback_idx is not None: + recipe.trainer.callbacks[comm_overlap_callback_idx].overlap_param_gather_with_optimizer_step = bool( + dp_size > 1 and pp_size > 1 and vp_size and vp_size > 1 + ) + + # Misc. 
for overall faster experiment runtime + recipe.log.ckpt = None + recipe.trainer.enable_checkpointing = False + recipe.trainer.val_check_interval = max_steps + recipe.trainer.log_every_n_steps = 1 + + return recipe + + +if __name__ == "__main__": + args = parse_cli_args().parse_args() + + exp_name = "_".join( + [ + basename(__file__), + args.compute_dtype, + f"{NUM_NODES}nodes", + f"tp{TP_SIZE}_pp{PP_SIZE}_cp{CP_SIZE}_vp{VP_SIZE}", + f"{MICRO_BATCH_SIZE}mbs_{GLOBAL_BATCH_SIZE}gbs", + ] + ) + + executor = slurm_executor( + args.account, + args.partition, + args.log_dir, + NUM_NODES, + NUM_GPUS_PER_NODE, + args.time_limit, + args.container_image, + custom_mounts=[], + custom_env_vars={ + "NVTE_FUSED_ATTN": "0", + }, + hf_token=args.hf_token, + nemo_home=args.nemo_home, + ) + + recipe = nemotron4_340b_performance_recipe( + args.compute_dtype, + NUM_NODES, + NUM_GPUS_PER_NODE, + MICRO_BATCH_SIZE, + GLOBAL_BATCH_SIZE, + TP_SIZE, + PP_SIZE, + CP_SIZE, + VP_SIZE, + MAX_STEPS, + ) + + if not args.tensorboard: # tensorboard adds performance overhead. + recipe.log.tensorboard = None + recipe.trainer.logger = False + else: + # default path is NOT intuitive- `/code/nemo_experiments/tb_logs/default/` + # following line ensures file is at- `/lightning_logs/tb_logs/default/` + recipe.log.log_dir = "/nemo_run/lightning_logs" + + plugins = [PerfEnvPlugin(enable_vboost=True, nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None)] + if args.enable_profiling: + plugins.append(NsysPlugin(start_step=5, end_step=6)) + + with run.Experiment(exp_name) as exp: + exp.add( + recipe, + executor=executor, + name=exp_name, + plugins=plugins, + ) + + if not args.dryrun: + exp.run(sequential=True, detach=True) + else: + exp.dryrun() diff --git a/scripts/llm/performance/utils.py b/scripts/llm/performance/utils.py index 5f50f8474fbc..68f4883451b2 100644 --- a/scripts/llm/performance/utils.py +++ b/scripts/llm/performance/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,16 +14,22 @@ import argparse import os -from typing import Dict, List, Optional +import sys +from typing import Dict, List import nemo_run as run from lightning.pytorch.callbacks.callback import Callback from nemo_run.config import NEMORUN_HOME from nemo.collections.common.tokenizers.huggingface import AutoTokenizer +from nemo.collections.llm.gpt.data.squad import SquadDataModule +from nemo.collections.llm.gpt.model import GPTModel from nemo.collections.llm.recipes.llama3_8b import MegatronCommOverlapCallback +from nemo.lightning.base import DEFAULT_NEMO_CACHE_HOME from nemo.utils import logging +DEFAULT_NEMO_HOME = os.getenv('NEMO_HOME', DEFAULT_NEMO_CACHE_HOME) + def slurm_executor( account: str, @@ -31,26 +37,24 @@ def slurm_executor( log_dir: str, nodes: int, num_gpus_per_node: int, - time_limit: str = "01:00:00", + time_limit: str = "00:30:00", container_image: str = "nvcr.io/nvidia/nemo:dev", - custom_mounts: Optional[List[str]] = None, - custom_env_vars: Optional[Dict[str, str]] = None, - custom_srun_args: Optional[List[str]] = None, - retries: int = 0, + custom_mounts: List[str] = [], + custom_env_vars: Dict[str, str] = {}, + custom_srun_args: List[str] = [], + hf_token: str = None, + nemo_home: str = DEFAULT_NEMO_HOME, ) -> run.SlurmExecutor: """ Slurm cluster definition with appropriate cluster params and NeMo container params needed for pre-training and fine-tuning experiments """ - if not (log_dir and account and partition and nodes and num_gpus_per_node): - raise RuntimeError( - "Please set user, host, remote_job_dir, account, partition, nodes and devices args for using this ", - "function.", - ) - - mounts = [] - if custom_mounts: - mounts.extend(custom_mounts) + err_msgs = [] + if log_dir != NEMORUN_HOME: + err_msgs.append(f"\nRun `export NEMORUN_HOME={log_dir}` in your shell environment and rerun this script.") + if len(err_msgs) > 0: + logging.error("\n".join(err_msgs)) + sys.exit(1) env_vars = { "TRANSFORMERS_OFFLINE": "1", @@ -59,16 +63,22 @@ def slurm_executor( "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", "NVTE_ASYNC_AMAX_REDUCTION": "1", "NVTE_FUSED_ATTN": "1", - "NVTE_FLASH_ATTN": "0", + "NVTE_FLASH_ATTN": "1", "NEMO_LOG_MEMORY_USAGE": "1", "NEMORUN_HOME": log_dir, } - if custom_env_vars: - env_vars |= custom_env_vars - + mounts = [] srun_args = ["--mpi=pmix"] - if custom_srun_args: - srun_args.extend(custom_srun_args) + + if nemo_home != DEFAULT_NEMO_CACHE_HOME: # DO NOT change this 'DEFAULT_NEMO_HOME'/'NEMO_HOME' + env_vars.update({"NEMO_HOME": nemo_home}) + mounts.extend([f"{nemo_home}:{nemo_home}"]) + if hf_token is not None: + env_vars.update({"HF_TOKEN": hf_token, "TRANSFORMERS_OFFLINE": "0"}) + + env_vars |= custom_env_vars + mounts.extend(custom_mounts) + srun_args.extend(custom_srun_args) executor = run.SlurmExecutor( account=account, @@ -78,18 +88,16 @@ def slurm_executor( ), nodes=nodes, ntasks_per_node=num_gpus_per_node, + container_image=container_image, + container_mounts=mounts, + env_vars=env_vars, + srun_args=srun_args, + time=time_limit, mem="0", exclusive=True, packager=run.GitArchivePackager(), ) - executor.container_image = container_image - executor.container_mounts = mounts - executor.env_vars = env_vars - executor.srun_args = srun_args - executor.retries = retries - executor.time = time_limit - return executor @@ -103,11 +111,11 @@ def hf_tokenizer(model_name: str) -> run.Config[AutoTokenizer]: huggingface.co/docs/transformers/v4.47.1/en/model_doc/auto#transformers.AutoTokenizer """ log_msg = [ - "AutoTokenizer first searches for tokenizer files 
locally in env var 'NEMO_HOME'.",
-        "If files are missing locally, AutoTokenizer will try downloading from HuggingFace.",
-        "Make sure 'TRANSFORMERS_OFFLINE=0' and 'HF_TOKEN:'.",
-        "You can set them as scripts.llm.performance.utils.slurm_executor(custom_env_vars=",
-        "{'TRANSFORMERS_OFFLINE: 0', 'HF_TOKEN: '}",
+        f"`AutoTokenizer` first searches for tokenizer files locally stored in {DEFAULT_NEMO_HOME}.",
+        "(from env var `NEMO_HOME`; this location can be changed using the '-nh/--nemo_home' CLI arg).",
+        "If files are missing locally, `AutoTokenizer` will try downloading from HuggingFace. In this case,",
+        "make sure env vars 'TRANSFORMERS_OFFLINE':'0' and 'HF_TOKEN':'<your_hf_token>' are set in your sbatch script.",
+        "Both of these will be set automatically if you provide the '-hf/--hf_token' CLI arg.",
     ]
     logging.warning(" ".join(log_msg))
 
@@ -118,7 +126,42 @@ def hf_tokenizer(model_name: str) -> run.Config[AutoTokenizer]:
     )
 
 
-def get_comm_overlap_callback_idx(callbacks: List[Callback]):
+def import_ckpt_experiment(executor: run.SlurmExecutor, model: run.Config[GPTModel], source: str):
+    """
+    Downloads or accesses the checkpoint to be used for fine-tuning. `import_ckpt` first tries to find the NeMo
+    checkpoint in <NEMO_HOME>/models/. For example, for llama3 8b the path will look like-
+    <NEMO_HOME>/models/meta-llama/Meta-Llama-3-8B. If missing, it downloads the checkpoint from HuggingFace to the
+    same location and converts it to NeMo format.
+
+    Args:
+        source (str): HuggingFace URL. For example- hf://meta-llama/Meta-Llama-3-70B
+    """
+    from copy import deepcopy
+
+    from nemo.collections.llm import import_ckpt
+
+    import_executor = deepcopy(executor)
+    import_executor.ntasks_per_node = 1
+    import_executor.nodes = 1
+
+    return run.Partial(import_ckpt, model=model, source=source, overwrite=False), import_executor, "import_ckpt_exp"
+
+
+def isfile_train_pack_metadata(hf_model_uri: str, data_config: run.Config[SquadDataModule]) -> bool:
+    """
+    This method is used for fine-tuning. It checks whether packed train data for a particular
+    sequence length exists locally. This is needed to set the data flag (force_redownload=True),
+    which avoids an experiment crash in case the files are missing.
+    """
+    datasets_dir = os.getenv("NEMO_DATASETS_CACHE", os.path.join(DEFAULT_NEMO_HOME, "datasets"))
+    model_dir = hf_model_uri.replace("/", "--")
+    metadata_filename = f"{data_config.seq_length}_metadata.jsonl"
+
+    train_pack_metadata_filepath = os.path.join(datasets_dir, "squad", "packed", model_dir, metadata_filename)
+
+    return os.path.exists(train_pack_metadata_filepath) and os.path.isfile(train_pack_metadata_filepath)
+
+
+def get_comm_overlap_callback_idx(callbacks: List[Callback]) -> int | None:
     """
     nemo.lightning.Trainer has a list of callbacks defined. This method identifies index of MegatronCommOverlapCallback
     from the list defined in recipes in nemo.collections.llm.recipes. The index is needed to override ddp communication
@@ -126,9 +169,9 @@ def get_comm_overlap_callback_idx(callbacks: List[Callback]):
     """
     if callbacks:  # default is None in lightning
         for idx, callback in enumerate(callbacks):
-            if isinstance(callback, MegatronCommOverlapCallback):
+            if callback.__fn_or_cls__ == MegatronCommOverlapCallback:
                 return idx
-    return -1
+    return None
 
 
 def parse_cli_args():
@@ -168,12 +211,15 @@ def parse_cli_args():
         required=False,
         default="00:30:00",
     )
+    container_img_msg = [
+        "NeMo container to use for experiment. Defaults to latest dev container- 'nvcr.io/nvidia/nemo:dev'.",
+        "Make sure your NGC credentials are accessible in your environment.",
+    ]
     parser.add_argument(
         "-i",
         "--container_image",
         type=str,
-        help="NeMo container to use for experiment. Defaults to latest dev container- 'nvcr.io/nvidia/nemo:dev'\
-            Make sure your NGC credentials are accessible in your environment.",
+        help=" ".join(container_img_msg),
         required=False,
         default="nvcr.io/nvidia/nemo:dev",
     )
@@ -197,6 +243,31 @@ def parse_cli_args():
         help="Enable tensorboard logging. Disabled by default",
         action="store_true",
     )
+    parser.add_argument(
+        "-f",
+        "--finetuning",
+        help="Finetuning scheme to use. Options- 'sft', 'lora'. Default is 'lora'.",
+        default='lora',
+    )
+    parser.add_argument(
+        "-hf",
+        "--hf_token",
+        type=str,
+        help="HuggingFace token. Defaults to None. Required for accessing tokenizers and checkpoints.",
+        default=None,
+    )
+    nemo_home_msg = [
+        "Sets env var `NEMO_HOME` (on the compute node via the sbatch script)- directory where NeMo searches",
+        "for models and checkpoints. This saves a lot of time (especially for bigger models) if checkpoints already",
+        f"exist here. Missing files will be downloaded here from HuggingFace. Defaults to {DEFAULT_NEMO_HOME}.",
+    ]
+    parser.add_argument(
+        "-nh",
+        "--nemo_home",
+        type=str,
+        help=" ".join(nemo_home_msg),
+        default=DEFAULT_NEMO_HOME,
+    )
     parser.add_argument(
         "-d",
         "--dryrun",