From 8b9d6c7b9f582d76462f88af4b167242fefbda20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 8 Jan 2025 00:34:34 +0100 Subject: [PATCH] Bump mcore (#11740) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore(beep boop 🤖): Bump `MCORE_TAG=076972e...` (2025-01-03) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Support attention backend configuration changes (#11517) * remove nvte attention flags from test_nemo_resume_from_ckpt Signed-off-by: Ananth Subramaniam * cherry pick 3410df6 Signed-off-by: Ananth Subramaniam * set local attention in config Signed-off-by: Ananth Subramaniam * retro config attention backend setting Signed-off-by: Ananth Subramaniam * set both Signed-off-by: Ananth Subramaniam * update unfused Signed-off-by: Ananth Subramaniam * gemma2b changes too Signed-off-by: Ananth Subramaniam * replace more usages Signed-off-by: Ananth Subramaniam * more test updates Signed-off-by: Ananth Subramaniam * Apply isort and black reformatting Signed-off-by: ananthsub * update unfused Signed-off-by: Ananth Subramaniam * remove duplicate gemma setting Signed-off-by: Ananth Subramaniam * remove gemma2b fused attn env vars Signed-off-by: Ananth Subramaniam * local for testing Signed-off-by: Ananth Subramaniam * update conftest to reset environment variables, use unfused for L2_Megatron_GPT_PEFT_Lora_TP2SP1 Signed-off-by: Ananth Subramaniam --------- Signed-off-by: Ananth Subramaniam Signed-off-by: ananthsub Signed-off-by: oliver könig Co-authored-by: ananthsub Co-authored-by: oliver könig * Some fixes to bump mcore (#11600) * chore(beep boop 🤖): Bump `MCORE_TAG=71c394b...` (2024-12-15) Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * ci: Add `no-fail-fast` mode Signed-off-by: Oliver Koenig * fix _get_layer_offset api for mllama Signed-off-by: yaoyu-33 * bump Signed-off-by: Oliver Koenig --------- Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Oliver Koenig Signed-off-by: yaoyu-33 Signed-off-by: oliver könig Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Co-authored-by: yaoyu-33 * Use empty dict instead of none to load only metadata from dist ckpt due to change in mcore commit https://github.com/NVIDIA/Megatron-LM/commit/31e8bfa926ca05a3b70a48d3ed6b86410a85a262 Signed-off-by: Chen Cui * remove mcore-inserted env vars Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa * Add raising=False for delenv Signed-off-by: Abhishree --------- Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Signed-off-by: Ananth Subramaniam Signed-off-by: ananthsub Signed-off-by: oliver könig Signed-off-by: Oliver Koenig Signed-off-by: yaoyu-33 Signed-off-by: Chen Cui Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Signed-off-by: Abhishree Co-authored-by: pablo-garay <7166088+pablo-garay@users.noreply.github.com> Co-authored-by: Ananth Subramaniam Co-authored-by: ananthsub Co-authored-by: yaoyu-33 Co-authored-by: Chen Cui Co-authored-by: Alexandros Koumparoulis Co-authored-by: akoumpa Co-authored-by: Abhishree --- .github/workflows/cicd-main.yml | 5 +++-- Dockerfile.ci | 2 +- docs/source/nlp/information_retrieval.rst | 3 +-- nemo/collections/diffusion/scripts/train.sh | 1 - nemo/collections/llm/gpt/model/gemma.py | 3 +++ nemo/collections/llm/recipes/gemma_2b.py | 2 -- nemo/collections/llm/recipes/gemma_7b.py | 4 ---- .../language_modeling/megatron_base_model.py | 5 +++++ .../language_modeling/megatron_retro_model.py | 3 +++ nemo/collections/vlm/mllama/model/language.py | 4 ++-- nemo/lightning/pytorch/callbacks/peft.py | 2 +- .../convert_bert_hf_to_nemo.py | 3 ++- .../llm/bitexact/mixtral/pretrain_mini_mixtral.py | 2 ++ tests/collections/llm/bitexact/mixtral/run.sh | 4 ++-- .../collections/llm/gpt/model/test_model_import.py | 5 +++++ tests/collections/llm/hf/peft_nemorun.py | 1 - tests/collections/llm/hf/sft_nemorun.py | 1 - .../llm/megatron_mixtral_pretraining.py | 2 ++ tests/conftest.py | 14 ++++++++++++++ tests/core/test_exp_manager.py | 4 ++-- tests/lightning/test_nemo_resume_from_ckpt.py | 10 +++++----- .../llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb | 2 -- .../llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb | 2 -- tutorials/llm/mamba/mamba.rst | 9 ++------- 24 files changed, 55 insertions(+), 38 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 101107dddc17..06c7a49ce3ce 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -2937,7 +2937,7 @@ jobs: with: RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | - CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + CUDA_DEVICE_MAX_CONNECTIONS=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ trainer.devices=2 \ trainer.log_every_n_steps=1 \ trainer.max_epochs=9999 \ @@ -2965,6 +2965,7 @@ jobs: +model.tp_comm_overlap_ag=False \ +model.tp_comm_overlap_rs=False \ +model.tp_comm_overlap_disable_qkv=True \ + +model.attention_backend="unfused" \ model.peft.peft_scheme="lora" \ model.peft.lora_tuning.adapter_dim=16 \ model.peft.lora_tuning.alpha=32 \ @@ -4331,7 +4332,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \ + python3 tests/collections/llm/megatron_mixtral_pretraining.py \ --experiment-dir=/tmp/mixtral_pretrain_results \ --data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document diff --git a/Dockerfile.ci b/Dockerfile.ci index e93d00d03195..75550998faa3 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -54,7 +54,7 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.21.0 -ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa +ARG MCORE_TAG=076972e37420b5325c5fe06e7131be7d96f05b53 ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst index 26732283e8f4..69f1c3219093 100644 --- a/docs/source/nlp/information_retrieval.rst +++ b/docs/source/nlp/information_retrieval.rst @@ -70,9 +70,7 @@ Then you can fine-tune the sentence-BERT model using the following script: VALIDATION_DATASET_PATH= # Path to validation dataset SAVE_DIR= # where the checkpoint and logs are saved mkdir -p $SAVE_DIR - export NVTE_FLASH_ATTN=0 export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 - export NVTE_FUSED_ATTN=0 python NeMo/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \ --config-path=${CONFIG_PATH} \ @@ -87,6 +85,7 @@ Then you can fine-tune the sentence-BERT model using the following script: model.post_process=False \ model.global_batch_size=8 \ # should be NUM_DEVICES * model.micro_batch_size model.micro_batch_size=8 \ + model.attention_backend="unfused" \ model.optim.lr=0.000005 \ model.optim.sched.min_lr=0.00000001 \ model.optim.sched.warmup_steps=100 \ diff --git a/nemo/collections/diffusion/scripts/train.sh b/nemo/collections/diffusion/scripts/train.sh index 2150458e9376..ced479e32526 100644 --- a/nemo/collections/diffusion/scripts/train.sh +++ b/nemo/collections/diffusion/scripts/train.sh @@ -20,7 +20,6 @@ export WANDB_PROJECT=xxx export WANDB_RUN_ID=xxx export WANDB_RESUME=allow -export NVTE_FUSED_ATTN=0 export CUDA_DEVICE_MAX_CONNECTIONS=1 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index bf828bb66277..4d8d541deaa8 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -18,6 +18,7 @@ import torch from megatron.core import parallel_state +from megatron.core.transformer.enums import AttnBackend from torch import nn from nemo.collections.llm.fn.activation import openai_gelu @@ -53,6 +54,8 @@ class GemmaConfig(GPTConfig): # Legacy NeMo does not set layernorm_zero_centered_gamma and instead adds 1 in the HF -> NeMo conversion script # The present implementation is more in line with the official implementation layernorm_zero_centered_gamma: bool = True + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + attention_backend: AttnBackend = AttnBackend.flash @dataclass diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py index 3b43bbdb0e62..64af8192929c 100644 --- a/nemo/collections/llm/recipes/gemma_2b.py +++ b/nemo/collections/llm/recipes/gemma_2b.py @@ -51,8 +51,6 @@ def model() -> run.Config[pl.LightningModule]: >>> model_config = model() >>> print(model_config) """ - # Disable cuDNN attention since TE 1.8 does not support head dim > 128 - os.environ['NVTE_FUSED_ATTN'] = "0" return run.Config(GemmaModel, config=run.Config(GemmaConfig2B)) diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py index 40e43bda4d5e..2ac3419d6587 100644 --- a/nemo/collections/llm/recipes/gemma_7b.py +++ b/nemo/collections/llm/recipes/gemma_7b.py @@ -51,8 +51,6 @@ def model() -> run.Config[pl.LightningModule]: >>> model_config = model() >>> print(model_config) """ - # Disable cuDNN attention since TE 1.8 does not support head dim > 128 - os.environ['NVTE_FUSED_ATTN'] = "0" return run.Config(GemmaModel, config=run.Config(GemmaConfig7B)) @@ -173,8 +171,6 @@ def pretrain_recipe( For more details on pre-training LLMs with NeMo, see the pre-training guide in the `examples/llm/pretrain/` directory. """ - # Disable cuDNN attention since TE 1.8 does not support head dim > 128 - os.environ['NVTE_FUSED_ATTN'] = "0" return run.Partial( fn, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 330f6ffee05b..53daf42f1a07 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -50,6 +50,7 @@ try: from megatron.core import ModelParallelConfig, parallel_state from megatron.core.distributed import DistributedDataParallel as McoreDDP + from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal @@ -537,6 +538,9 @@ def build_transformer_config(self) -> TransformerConfig: tp_only_amax_red = self.cfg.get('tp_only_amax_red', False) + attention_backend = self.cfg.get('attention_backend', "auto") + attention_backend = AttnBackend[attention_backend] + # any configs that are not in the nemo model config will be added here config_mapping = { 'apply_query_key_layer_scaling': apply_query_key_layer_scaling, @@ -561,6 +565,7 @@ def build_transformer_config(self) -> TransformerConfig: 'rotary_interleaved': rotary_interleaved, 'deallocate_pipeline_outputs': True, 'tp_only_amax_red': tp_only_amax_red, + 'attention_backend': attention_backend, } # populate the transformer config dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py index 493d512fd30e..b3fd7b11c6eb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py @@ -76,6 +76,7 @@ from megatron.core.models.retro.utils import get_config_path as get_retro_config_path from megatron.core.models.retro.utils import get_gpt_data_dir as get_retro_data_dir from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal @@ -431,6 +432,8 @@ def build_retro_config(self) -> RetroConfig: te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.3"): + if HAVE_MEGATRON_CORE: + retro_config.attention_backend = AttnBackend.unfused try: os.environ["NVTE_FLASH_ATTN"] = "0" os.environ["NVTE_FUSED_ATTN"] = "0" diff --git a/nemo/collections/vlm/mllama/model/language.py b/nemo/collections/vlm/mllama/model/language.py index bec3ec526f6e..3edc6706defb 100644 --- a/nemo/collections/vlm/mllama/model/language.py +++ b/nemo/collections/vlm/mllama/model/language.py @@ -390,7 +390,7 @@ def sharded_state_dict( layer_prefix = f'{prefix}layers.' num_layers = self.config.num_layers for layer in self.layers: - offset = layer._get_layer_offset() + offset = layer._get_layer_offset(layer.config) global_layer_offset = layer.layer_number - 1 # self.layer_number starts at 1 state_dict_prefix = f'{layer_prefix}{global_layer_offset - offset}.' # module list index in TransformerBlock # pylint: disable=line-too-long sharded_prefix = layer_prefix @@ -403,7 +403,7 @@ def sharded_state_dict( for xlayer in self.xattn_layers: if isinstance(xlayer, DummyCrossAttentionTransformerLayer): continue - offset = xlayer._get_layer_offset() + offset = xlayer._get_layer_offset(xlayer.config) global_layer_offset = xlayer.layer_number - 1 state_dict_prefix = f'{xlayer_prefix}{global_layer_offset - offset}.' # module list index in TransformerBlock # pylint: disable=line-too-long sharded_prefix = f'{xlayer_prefix}{global_layer_offset}.' diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 0c559d1b3990..c830a5de63f6 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -448,7 +448,7 @@ def load_checkpoint( if getattr(path, "base_model_path", None): ## PEFT Resume, FIRST TIME self.adapter_ckpt_path = Path(str(path)) - adapter_ckpt = self.checkpoint_io.load_checkpoint(path) # Loads only metadata + adapter_ckpt = self.checkpoint_io.load_checkpoint(path, sharded_state_dict={}) # Loads only metadata # path is adapter path to restore the training metadata, but switch to loading base model here. path = self.model_ckpt_path = path.base_model_path elif adapter_meta_path.exists(): diff --git a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py index 14baca53f165..8265da57f656 100644 --- a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py @@ -84,6 +84,8 @@ def convert(args): nemo_config.model = adjust_nemo_config(nemo_config.model, hf_model.config.to_dict(), mcore_bert=args.mcore) nemo_config.trainer["precision"] = args.precision + # Bert doesn't support FLASH_ATTN + nemo_config.model["attention_backend"] = "fused" trainer = MegatronTrainerBuilder(nemo_config).create_trainer() model = MegatronBertModel(nemo_config.model, trainer) @@ -288,6 +290,5 @@ def convert(args): if __name__ == '__main__': - os.environ['NVTE_FLASH_ATTN'] = '0' # Bert doesn't support FLASH_ATTN args = get_args() convert(args) diff --git a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py index b4f95879bad5..654a2a9e05a8 100644 --- a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py +++ b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py @@ -17,6 +17,7 @@ import torch from megatron.core.distributed import DistributedDataParallelConfig as McoreDDPConfig +from megatron.core.transformer.enums import AttnBackend from megatron.core.utils import init_method_normal, scaled_init_method_normal from nemo.collections.llm import MixtralConfig8x7B, MixtralModel, PreTrainingDataModule @@ -102,6 +103,7 @@ def main(args): bias_dropout_fusion=True, apply_rope_fusion=True, distribute_saved_activations=False, + attention_backend=AttnBackend.unfused, ) data = PreTrainingDataModule( diff --git a/tests/collections/llm/bitexact/mixtral/run.sh b/tests/collections/llm/bitexact/mixtral/run.sh index 87bf7c382b99..0f6612b3d21b 100644 --- a/tests/collections/llm/bitexact/mixtral/run.sh +++ b/tests/collections/llm/bitexact/mixtral/run.sh @@ -8,7 +8,7 @@ MCORE_OUTPUT_PATH="/tmp/bex_mixtral_mcore_output/" NEMO_OUTPUT_PATH="/tmp/bex_mixtral_nemo_output/" # Run Mcore -CUDA_DEVICE_MAX_CONNECTIONS=1 CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 \ +CUDA_DEVICE_MAX_CONNECTIONS=1 CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 \ torchrun --nproc-per-node 1 --nnodes 1 /workspace/Megatron-LM/pretrain_gpt.py \ --apply-layernorm-1p --rotary-percent 1.0 --rotary-base 1000000 \ --no-position-embedding --position-embedding-type rope \ @@ -30,7 +30,7 @@ torchrun --nproc-per-node 1 --nnodes 1 /workspace/Megatron-LM/pretrain_gpt.py \ --split 99,1,0 --log-interval 10 --save-interval 20000 --eval-interval 1000 --eval-iters 32 \ --save "$MCORE_OUTPUT_PATH" \ --log-num-zeros-in-grad --distributed-timeout-minutes 6000 --moe-router-topk 1 --num-experts 2 \ - --moe-router-pre-softmax --expert-model-parallel-size 1 --eval-iters=0 + --moe-router-pre-softmax --expert-model-parallel-size 1 --eval-iters=0 --attention-backend unfused # Run NeMo CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 \ diff --git a/tests/collections/llm/gpt/model/test_model_import.py b/tests/collections/llm/gpt/model/test_model_import.py index 9edc235e454f..b49885718837 100644 --- a/tests/collections/llm/gpt/model/test_model_import.py +++ b/tests/collections/llm/gpt/model/test_model_import.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os + import torch torch.set_grad_enabled(False) @@ -95,5 +97,8 @@ def import_from_hf(config_name, hf_path): if __name__ == '__main__': for config_name, hf_id in config_name_to_hf_id.items(): + for env_var in ['NVTE_FLASH_ATTN', 'NVTE_FUSED_ATTN', 'NVTE_UNFUSED_ATTN']: + if env_var in os.environ: + del os.environ[env_var] src = f'hf:///home/TestData/nemo2_ckpt/{config_name}' import_from_hf(config_name, src) diff --git a/tests/collections/llm/hf/peft_nemorun.py b/tests/collections/llm/hf/peft_nemorun.py index ef34d4d39a11..3a135b2346be 100644 --- a/tests/collections/llm/hf/peft_nemorun.py +++ b/tests/collections/llm/hf/peft_nemorun.py @@ -28,7 +28,6 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecut "NCCL_NVLS_ENABLE": "0", "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/hf/sft_nemorun.py b/tests/collections/llm/hf/sft_nemorun.py index a3daa66ca774..b559c04f6cbd 100644 --- a/tests/collections/llm/hf/sft_nemorun.py +++ b/tests/collections/llm/hf/sft_nemorun.py @@ -29,7 +29,6 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecut "NCCL_NVLS_ENABLE": "0", "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/megatron_mixtral_pretraining.py b/tests/collections/llm/megatron_mixtral_pretraining.py index 4123c7b37987..2a7b1fdfdad6 100644 --- a/tests/collections/llm/megatron_mixtral_pretraining.py +++ b/tests/collections/llm/megatron_mixtral_pretraining.py @@ -18,6 +18,7 @@ import torch from megatron.core.distributed import DistributedDataParallelConfig as McoreDDPConfig +from megatron.core.transformer.enums import AttnBackend from nemo.collections.llm import MixtralConfig8x3B, MixtralModel, PreTrainingDataModule from nemo.collections.llm.api import train @@ -117,6 +118,7 @@ def main(args): bf16=True, params_dtype=torch.bfloat16, pipeline_dtype=torch.bfloat16, + attention_backend=AttnBackend.unfused, ) mixtral_config.overlap_param_gather_with_optimizer_step = True diff --git a/tests/conftest.py b/tests/conftest.py index 118e978e63c7..989c937ab499 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +import os import os.path import shutil import tarfile @@ -122,6 +123,19 @@ def reset_singletons(): Singleton._Singleton__instances = {} +@pytest.fixture(autouse=True) +def reset_env_vars(): + # Store the original environment variables before the test + original_env = dict(os.environ) + + # Run the test + yield + + # After the test, restore the original environment + os.environ.clear() + os.environ.update(original_env) + + @pytest.fixture(scope="session") def test_data_dir(): """ diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index 32d401b2051f..9dbdaa66a25e 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -280,7 +280,7 @@ def test_log_dir_overrides(self, monkeypatch, tmp_path): assert Path(tmp_path).exists() assert Path(tmp_path / "test_no_name" / "default" / "957").exists() - monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION) + monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION, raising=False) # Checks that use_datetime_version False toggle works test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) log_dir = exp_manager(test_trainer, {"exp_dir": str(tmp_path / "test_no_name"), "use_datetime_version": False}) @@ -288,7 +288,7 @@ def test_log_dir_overrides(self, monkeypatch, tmp_path): assert Path(tmp_path).exists() assert Path(tmp_path / "test_no_name" / "default" / "version_0").exists() - monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION) + monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION, raising=False) # Checks that use_datetime_version False toggle works and version increments test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) log_dir = exp_manager(test_trainer, {"exp_dir": str(tmp_path / "test_no_name"), "use_datetime_version": False}) diff --git a/tests/lightning/test_nemo_resume_from_ckpt.py b/tests/lightning/test_nemo_resume_from_ckpt.py index e876e6965000..37ea326ad621 100644 --- a/tests/lightning/test_nemo_resume_from_ckpt.py +++ b/tests/lightning/test_nemo_resume_from_ckpt.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. import os +from typing import List, Optional import pytest def set_env(): - os.environ['NVTE_FLASH_ATTN'] = '0' - os.environ['NVTE_FUSED_ATTN'] = '0' os.environ['NVTE_APPLY_QK_LAYER_SCALING'] = '0' @@ -28,6 +27,7 @@ def set_env(): import pytest import torch from megatron.core.optimizer import OptimizerConfig +from megatron.core.transformer.enums import AttnBackend import nemo.lightning as nl from nemo.collections import llm @@ -68,7 +68,8 @@ def load_dcp(ckpt_dir, torch_tensor=True): return state_dict -def compare_ckpts(a, b, path=[]): +def compare_ckpts(a, b, path: Optional[List[str]] = None): + path = path if path is not None else [] if isinstance(a, dict): assert isinstance(b, dict) assert set(a.keys()) == set(b.keys()) @@ -125,6 +126,7 @@ def setup_model_optim(log_dir, n_steps, tokenizer, gbs=2, mbs=1): make_vocab_size_divisible_by=128, normalization='RMSNorm', masked_softmax_fusion=False, + attention_backend=AttnBackend.local, ) model = llm.GPTModel(gpt_config, tokenizer=tokenizer) @@ -269,8 +271,6 @@ def train(n_steps, resume): trainer._teardown() set_env() - assert os.environ['NVTE_FLASH_ATTN'] == '0' - assert os.environ['NVTE_FUSED_ATTN'] == '0' assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '0' # Train for 40 steps diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb index b3393d133a45..3895c3b74757 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb @@ -341,7 +341,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", @@ -457,7 +456,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb index e84ff916fc4e..0bb4367d50e9 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb @@ -482,7 +482,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", @@ -565,7 +564,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", diff --git a/tutorials/llm/mamba/mamba.rst b/tutorials/llm/mamba/mamba.rst index 197825c27d58..7f5e901659a4 100644 --- a/tutorials/llm/mamba/mamba.rst +++ b/tutorials/llm/mamba/mamba.rst @@ -103,9 +103,6 @@ Run Fine-Tuning CONFIG_NAME="megatron_mamba_finetuning_config" SAVE_DIR= - export NVTE_FUSED_ATTN=1 - export NVTE_FLASH_ATTN=0 - torchrun --nproc_per_node=${NUM_DEVICES} \ /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py \ --config-path=${CONFIG_PATH} \ @@ -129,6 +126,7 @@ Run Fine-Tuning model.peft.peft_scheme='none' \ model.megatron_amp_O2=True \ model.encoder_seq_length=${SEQ_LEN} \ + model.attention_backend='fused' \ model.data.validation_ds.pad_to_max_length=True \ model.data.train_ds.pad_to_max_length=True \ model.optim.name="distributed_fused_adam" \ @@ -162,10 +160,6 @@ Evaluating the Fine-Tuned Model CONFIG_NAME="megatron_mamba_finetuning_config" SAVE_DIR= - export NVTE_FUSED_ATTN=1 - export NVTE_FLASH_ATTN=0 - - CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/tuning/conf/" CONFIG_NAME="megatron_mamba_generate_config" @@ -185,6 +179,7 @@ Evaluating the Fine-Tuned Model exp_manager.exp_dir=${SAVE_DIR} \ exp_manager.resume_if_exists=False \ exp_manager.create_wandb_logger=False \ + model.attention_backend='fused' \ model.megatron_amp_O2=True \ model.peft.restore_from_path=False \ +model.peft.restore_from_ckpt.checkpoint_dir=False \