From 86010812e6b4b7cab81cf983c2bc2f7c6166ddcf Mon Sep 17 00:00:00 2001
From: xinyanhe
Date: Tue, 7 Jan 2025 10:47:39 +0800
Subject: [PATCH] Add Ascend NPU as a backend for single device recipes

change torch.npu.is_available() to is_npu_available in precision.py
---
 recipes/dev/generate_v2.py                      |  3 ++-
 recipes/eleuther_eval.py                        |  3 ++-
 recipes/full_finetune_single_device.py          |  4 ++--
 recipes/generate.py                             |  3 ++-
 recipes/knowledge_distillation_single_device.py | 14 +++++++++-----
 recipes/lora_dpo_single_device.py               |  6 +++---
 recipes/lora_finetune_single_device.py          |  6 +++---
 recipes/quantize.py                             |  3 ++-
 torchtune/training/precision.py                 |  2 +-
 9 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/recipes/dev/generate_v2.py b/recipes/dev/generate_v2.py
index 3ce95a9fdf..acc5999277 100644
--- a/recipes/dev/generate_v2.py
+++ b/recipes/dev/generate_v2.py
@@ -111,8 +111,9 @@ def log_metrics(self, total_time: int, tokens_per_second: float) -> None:
         self._logger.info(
             f"Bandwidth achieved: {model_size * tokens_per_second / 1e9:.02f} GB/s"
         )
+        torch_device = utils.get_torch_device_namespace()
         self._logger.info(
-            f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB"
+            f"Max memory allocated: {torch_device.max_memory_allocated() / 1e9:.02f} GB"
         )
 
     @torch.inference_mode()
diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py
index 5693e899ad..e3a16eeb14 100644
--- a/recipes/eleuther_eval.py
+++ b/recipes/eleuther_eval.py
@@ -534,6 +534,7 @@ def evaluate(self) -> None:
         # Initialize tasks for the harness
         task_manager = TaskManager(include_path=self.include_path)
         task_dict = get_task_dict(self.tasks, task_manager)
+        torch_device = utils.get_torch_device_namespace()
 
         # Run evaluation
         t0 = time.time()
@@ -548,7 +549,7 @@ def evaluate(self) -> None:
         # Log metrics
         self.logger.info(f"Eval completed in {t1:.02f} seconds.")
         self.logger.info(
-            f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB"
+            f"Max memory allocated: {torch_device.max_memory_allocated() / 1e9:.02f} GB"
         )
         formatted_output = make_table(output)
         self.logger.info(f"\n\n{formatted_output}\n")
diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
index 946e970206..7870c3758e 100644
--- a/recipes/full_finetune_single_device.py
+++ b/recipes/full_finetune_single_device.py
@@ -131,9 +131,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type != "cuda":
+        if self._log_peak_memory_stats and self._device.type == "cpu":
             log.info(
-                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training uses cpu. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
diff --git a/recipes/generate.py b/recipes/generate.py
index 56723b04bd..848c3418b9 100644
--- a/recipes/generate.py
+++ b/recipes/generate.py
@@ -183,11 +183,12 @@ def generate(self, cfg: DictConfig):
 
         tokens_generated = len(generated_tokens[0]) - prompt.size(0)
         tokens_sec = tokens_generated / t
+        torch_device = utils.get_torch_device_namespace()
         logger.info(
             f"Time for inference: {t:.02f} sec total, {tokens_sec:.02f} tokens/sec"
         )
         logger.info(f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s")
-        logger.info(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")
+        logger.info(f"Memory used: {torch_device.max_memory_allocated() / 1e9:.02f} GB")
 
 
 @config.parse
diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py
index 71d850d791..79bf1d4b2b 100644
--- a/recipes/knowledge_distillation_single_device.py
+++ b/recipes/knowledge_distillation_single_device.py
@@ -120,9 +120,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type != "cuda":
+        if self._log_peak_memory_stats and self._device.type == "cpu":
             log.info(
-                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training uses cpu. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
@@ -223,6 +223,10 @@ def setup(self, cfg: DictConfig) -> None:
         self._metric_logger.log_config(cfg)
 
         self._compile = cfg.compile
+        if cfg.device == "npu" and cfg.compile:
+            raise ValueError(
+                "NPU does not support model compilation. Please set `compile: False` in the config."
+            )
         checkpoint_dict = self.load_checkpoint(cfg_checkpointer=cfg.checkpointer)
         teacher_checkpoint_dict = self.load_teacher_checkpoint(
             cfg_checkpointer=cfg.teacher_checkpointer
@@ -447,7 +451,7 @@ def _setup_model(
 
         log.info(f"Student model is initialized with precision {self._dtype}.")
 
-        if self._device.type == "cuda":
+        if self._device.type != "cpu":
             log.info("Memory stats initializing student model:")
             memory_stats = training.get_memory_stats(device=self._device)
             training.log_memory_stats(
@@ -476,7 +480,7 @@ def _setup_teacher_model(
         )
         log.info(f"Teacher model is initialized with precision {self._dtype}.")
 
-        if self._device.type == "cuda":
+        if self._device.type != "cpu":
             memory_stats = training.get_memory_stats(device=self._device)
             training.log_memory_stats(
                 memory_stats, message="Memory stats after teacher model init:"
@@ -753,7 +757,7 @@ def train(self) -> None:
                         "tokens_per_second_per_gpu": num_tokens / time_per_step,
                     }
                     if (
-                        self._device.type == "cuda"
+                        self._device.type != "cpu"
                         and self._log_peak_memory_stats
                     ):
                         log_dict.update(
diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py
index 9b5dc6fb1a..c493b65602 100644
--- a/recipes/lora_dpo_single_device.py
+++ b/recipes/lora_dpo_single_device.py
@@ -98,9 +98,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type != "cuda":
+        if self._log_peak_memory_stats and self._device.type == "cpu":
             log.info(
-                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training uses cpu. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
@@ -327,7 +327,7 @@ def _setup_model(
         # Compile model, if enabled.
         if compile_model:
             training.compile_model(model)
-        if self._device == torch.device("cuda"):
+        if self._device.type != "cpu":
             memory_stats = training.get_memory_stats(device=self._device)
             training.log_memory_stats(memory_stats)
         return model
diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
index 9a3f3eacfb..f35fbaeb59 100644
--- a/recipes/lora_finetune_single_device.py
+++ b/recipes/lora_finetune_single_device.py
@@ -136,9 +136,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type != "cuda":
+        if self._log_peak_memory_stats and self._device.type == "cpu":
             log.info(
-                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training uses cpu. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
@@ -735,7 +735,7 @@ def train(self) -> None:
                         "tokens_per_second_per_gpu": num_tokens / time_per_step,
                     }
                     if (
-                        self._device.type == "cuda"
+                        self._device.type != "cpu"
                         and self._log_peak_memory_stats
                     ):
                         log_dict.update(
diff --git a/recipes/quantize.py b/recipes/quantize.py
index bb28d45b87..2b5fa96216 100644
--- a/recipes/quantize.py
+++ b/recipes/quantize.py
@@ -91,8 +91,9 @@ def quantize(self, cfg: DictConfig):
         else:
             self._model = self._quantizer.quantize(self._model)
         t = time.perf_counter() - t0
+        torch_device = utils.get_torch_device_namespace()
         logger.info(f"Time for quantization: {t:.02f} sec")
-        logger.info(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")
+        logger.info(f"Memory used: {torch_device.max_memory_allocated() / 1e9:.02f} GB")
 
     def save_checkpoint(self, cfg: DictConfig):
         ckpt_dict = self._model.state_dict()
diff --git a/torchtune/training/precision.py b/torchtune/training/precision.py
index 85a2c07e4f..6aa9a80b4d 100644
--- a/torchtune/training/precision.py
+++ b/torchtune/training/precision.py
@@ -33,7 +33,7 @@ def _set_float32_precision(precision: str = "high") -> None:
     Args:
         precision (str): The setting to determine which datatypes to use for matrix multiplication and convolution operations.
     """
-    if not torch.cuda.is_available():  # Not relevant for non-CUDA devices
+    if not torch.cuda.is_available() and not is_npu_available:  # Not relevant for non-CUDA/NPU devices
         return
     # set precision for matrix multiplications
     torch.set_float32_matmul_precision(precision)
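
A note on the pattern used throughout the recipe hunks above: utils.get_torch_device_namespace() (already imported in these recipes) is expected to return the torch backend module matching the active device (torch.cuda on CUDA, torch.npu on Ascend NPU), so max_memory_allocated() can be queried without hard-coding CUDA. Below is a minimal sketch of that usage, for illustration only; get_torch_device_namespace is the only torchtune symbol assumed, and log_peak_memory is a made-up example function, not part of this patch.

    import logging

    from torchtune import utils

    logger = logging.getLogger(__name__)

    def log_peak_memory() -> None:
        # Resolves to torch.cuda on CUDA systems and torch.npu on Ascend NPU,
        # so the same logging call works on either backend.
        torch_device = utils.get_torch_device_namespace()
        logger.info(
            f"Max memory allocated: {torch_device.max_memory_allocated() / 1e9:.02f} GB"
        )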