From 86010812e6b4b7cab81cf983c2bc2f7c6166ddcf Mon Sep 17 00:00:00 2001
From: xinyanhe
Date: Tue, 7 Jan 2025 10:47:39 +0800
Subject: [PATCH] Add Ascend NPU as a backend for single device recipes

change torch.npu.is_available() to is_npu_available in precision.py
---
 recipes/dev/generate_v2.py                      |  3 ++-
 recipes/eleuther_eval.py                        |  3 ++-
 recipes/full_finetune_single_device.py          |  4 ++--
 recipes/generate.py                             |  3 ++-
 recipes/knowledge_distillation_single_device.py | 14 +++++++++-----
 recipes/lora_dpo_single_device.py               |  6 +++---
 recipes/lora_finetune_single_device.py          |  6 +++---
 recipes/quantize.py                             |  3 ++-
 torchtune/training/precision.py                 |  2 +-
 9 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/recipes/dev/generate_v2.py b/recipes/dev/generate_v2.py
index 3ce95a9fdf..acc5999277 100644
--- a/recipes/dev/generate_v2.py
+++ b/recipes/dev/generate_v2.py
@@ -111,8 +111,9 @@ def log_metrics(self, total_time: int, tokens_per_second: float) -> None:
         self._logger.info(
             f"Bandwidth achieved: {model_size * tokens_per_second / 1e9:.02f} GB/s"
         )
+        torch_device = utils.get_torch_device_namespace()
         self._logger.info(
-            f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB"
+            f"Max memory allocated: {torch_device.max_memory_allocated() / 1e9:.02f} GB"
         )
 
     @torch.inference_mode()
diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py
index 5693e899ad..e3a16eeb14 100644
--- a/recipes/eleuther_eval.py
+++ b/recipes/eleuther_eval.py
@@ -534,6 +534,7 @@ def evaluate(self) -> None:
         # Initialize tasks for the harness
         task_manager = TaskManager(include_path=self.include_path)
         task_dict = get_task_dict(self.tasks, task_manager)
+        torch_device = utils.get_torch_device_namespace()
 
         # Run evaluation
         t0 = time.time()
@@ -548,7 +549,7 @@ def evaluate(self) -> None:
         # Log metrics
         self.logger.info(f"Eval completed in {t1:.02f} seconds.")
         self.logger.info(
-            f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB"
+            f"Max memory allocated: {torch_device.max_memory_allocated() / 1e9:.02f} GB"
         )
         formatted_output = make_table(output)
         self.logger.info(f"\n\n{formatted_output}\n")
diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
index 946e970206..7870c3758e 100644
--- a/recipes/full_finetune_single_device.py
+++ b/recipes/full_finetune_single_device.py
@@ -131,9 +131,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type != "cuda":
+        if self._log_peak_memory_stats and self._device.type == "cpu":
             log.info(
-                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training uses cpu. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
diff --git a/recipes/generate.py b/recipes/generate.py
index 56723b04bd..848c3418b9 100644
--- a/recipes/generate.py
+++ b/recipes/generate.py
@@ -183,11 +183,12 @@ def generate(self, cfg: DictConfig):
 
         tokens_generated = len(generated_tokens[0]) - prompt.size(0)
         tokens_sec = tokens_generated / t
+        torch_device = utils.get_torch_device_namespace()
         logger.info(
             f"Time for inference: {t:.02f} sec total, {tokens_sec:.02f} tokens/sec"
         )
         logger.info(f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s")
-        logger.info(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")
+        logger.info(f"Memory used: {torch_device.max_memory_allocated() / 1e9:.02f} GB")
 
 
 @config.parse
diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py
index 71d850d791..79bf1d4b2b 100644
--- a/recipes/knowledge_distillation_single_device.py
+++ b/recipes/knowledge_distillation_single_device.py
@@ -120,9 +120,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type != "cuda":
+        if self._log_peak_memory_stats and self._device.type == "cpu":
             log.info(
-                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training uses cpu. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
@@ -223,6 +223,10 @@ def setup(self, cfg: DictConfig) -> None:
         self._metric_logger.log_config(cfg)
 
         self._compile = cfg.compile
+        if cfg.device == "npu" and cfg.compile:
+            raise ValueError(
+                "NPU does not support model compilation. Please set `compile: False` in the config."
+            )
         checkpoint_dict = self.load_checkpoint(cfg_checkpointer=cfg.checkpointer)
         teacher_checkpoint_dict = self.load_teacher_checkpoint(
             cfg_checkpointer=cfg.teacher_checkpointer
@@ -447,7 +451,7 @@ def _setup_model(
 
         log.info(f"Student model is initialized with precision {self._dtype}.")
 
-        if self._device.type == "cuda":
+        if self._device.type != "cpu":
             log.info("Memory stats initializing student model:")
             memory_stats = training.get_memory_stats(device=self._device)
             training.log_memory_stats(
@@ -476,7 +480,7 @@ def _setup_teacher_model(
         )
         log.info(f"Teacher model is initialized with precision {self._dtype}.")
 
-        if self._device.type == "cuda":
+        if self._device.type != "cpu":
             memory_stats = training.get_memory_stats(device=self._device)
             training.log_memory_stats(
                 memory_stats, message="Memory stats after teacher model init:"
@@ -753,7 +757,7 @@ def train(self) -> None:
                         "tokens_per_second_per_gpu": num_tokens / time_per_step,
                     }
                     if (
-                        self._device.type == "cuda"
+                        self._device.type != "cpu"
                         and self._log_peak_memory_stats
                     ):
                         log_dict.update(
diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py
index 9b5dc6fb1a..c493b65602 100644
--- a/recipes/lora_dpo_single_device.py
+++ b/recipes/lora_dpo_single_device.py
@@ -98,9 +98,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type != "cuda":
+        if self._log_peak_memory_stats and self._device.type == "cpu":
             log.info(
-                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training uses cpu. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
@@ -327,7 +327,7 @@ def _setup_model(
         # Compile model, if enabled.
         if compile_model:
             training.compile_model(model)
-        if self._device == torch.device("cuda"):
+        if self._device.type != "cpu":
             memory_stats = training.get_memory_stats(device=self._device)
             training.log_memory_stats(memory_stats)
         return model
diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
index 9a3f3eacfb..f35fbaeb59 100644
--- a/recipes/lora_finetune_single_device.py
+++ b/recipes/lora_finetune_single_device.py
@@ -136,9 +136,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type != "cuda":
+        if self._log_peak_memory_stats and self._device.type == "cpu":
             log.info(
-                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training uses cpu. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
@@ -735,7 +735,7 @@ def train(self) -> None:
                         "tokens_per_second_per_gpu": num_tokens / time_per_step,
                     }
                     if (
-                        self._device.type == "cuda"
+                        self._device.type != "cpu"
                         and self._log_peak_memory_stats
                     ):
                         log_dict.update(
diff --git a/recipes/quantize.py b/recipes/quantize.py
index bb28d45b87..2b5fa96216 100644
--- a/recipes/quantize.py
+++ b/recipes/quantize.py
@@ -91,8 +91,9 @@ def quantize(self, cfg: DictConfig):
         else:
             self._model = self._quantizer.quantize(self._model)
         t = time.perf_counter() - t0
+        torch_device = utils.get_torch_device_namespace()
         logger.info(f"Time for quantization: {t:.02f} sec")
-        logger.info(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")
+        logger.info(f"Memory used: {torch_device.max_memory_allocated() / 1e9:.02f} GB")
 
     def save_checkpoint(self, cfg: DictConfig):
         ckpt_dict = self._model.state_dict()
diff --git a/torchtune/training/precision.py b/torchtune/training/precision.py
index 85a2c07e4f..6aa9a80b4d 100644
--- a/torchtune/training/precision.py
+++ b/torchtune/training/precision.py
@@ -33,7 +33,7 @@ def _set_float32_precision(precision: str = "high") -> None:
     Args:
         precision (str): The setting to determine which datatypes to use for matrix multiplication and convolution operations.
     """
-    if not torch.cuda.is_available():  # Not relevant for non-CUDA devices
+    if not torch.cuda.is_available() and not is_npu_available:  # Not relevant for non-CUDA/NPU devices
         return
     # set precision for matrix multiplications
     torch.set_float32_matmul_precision(precision)
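
A note on the pattern used throughout the recipe hunks above: utils.get_torch_device_namespace() (already imported in these recipes) is expected to return the torch backend module matching the active device (torch.cuda on CUDA, torch.npu on Ascend NPU), so max_memory_allocated() can be queried without hard-coding CUDA. Below is a minimal sketch of that usage, for illustration only; get_torch_device_namespace is the only torchtune symbol assumed, and log_peak_memory is a made-up example function, not part of this patch.

    import logging

    from torchtune import utils

    logger = logging.getLogger(__name__)

    def log_peak_memory() -> None:
        # Resolves to torch.cuda on CUDA systems and torch.npu on Ascend NPU,
        # so the same logging call works on either backend.
        torch_device = utils.get_torch_device_namespace()
        logger.info(
            f"Max memory allocated: {torch_device.max_memory_allocated() / 1e9:.02f} GB"
        )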