Draft: Refactor JobSet for Pathways #918

Draft · wants to merge 64 commits into base: main

Changes shown from 56 of the 64 commits.

Commits
582329a
Adding support for Pathways proxy
jesus-orozco Sep 9, 2024
8d3c643
Update pathways-utils dependency and fix formatting
jesus-orozco Oct 1, 2024
0e61b76
Move pathways package to its own dependency tree and pin it to a spec…
jesus-orozco Oct 7, 2024
4260c38
Relocate pathwaysutils import
jesus-orozco Oct 9, 2024
ae80a36
Create custom jobset for pathways
jesus-orozco Oct 9, 2024
9960027
Updates to pathways jobset creation
jesus-orozco Oct 16, 2024
f02d345
Merge branch 'main' into feature/jax_pathways
jesus-orozco Oct 17, 2024
6c3c083
Merge branch 'apple:main' into feature/jax_pathways
jesus-orozco Oct 25, 2024
ac6bcd2
Update pathwaysutils source to pypi
jesus-orozco Oct 25, 2024
43ac5e4
Merge branch 'main' into feature/pathways_workload
jesus-orozco Oct 28, 2024
05f311d
Merge branch 'apple:main' into feature/jax_pathways
jesus-orozco Oct 28, 2024
8972933
trillium testing baseline
jesus-orozco Nov 6, 2024
a952c6f
revert dockerfile for upstream merge
jesus-orozco Nov 6, 2024
34571aa
Merge branch 'apple:main' into trillium_testing
jesus-orozco Nov 6, 2024
0e8ae86
fixed pdbs 3 for fuji 70b
jesus-orozco Nov 6, 2024
d9d458a
testing pdbs 3 with 2 v6e-256 slices
jesus-orozco Nov 6, 2024
73da333
use maxtext xla flags only
jesus-orozco Nov 7, 2024
01bcf8f
new baseline for pdbs=3 without ffn_dim
jesus-orozco Nov 7, 2024
b1ba4fe
try xla sc offload flags
jesus-orozco Nov 7, 2024
6b24cd2
revert AR + SC offload flags
jesus-orozco Nov 7, 2024
aa06778
output jobset to yaml file
jesus-orozco Nov 7, 2024
efabbfd
calculate batch size based on flags
jesus-orozco Nov 7, 2024
cd556c7
enable ffn 3.5 and test pdbs 3 with 4 slices
jesus-orozco Nov 7, 2024
90b0d26
retry 4 slices with pdbs 3
jesus-orozco Nov 7, 2024
4974458
test xla_tpu_enable_sparse_core_collective_offload_all_reduce
jesus-orozco Nov 8, 2024
caff395
remove xla_tpu_enable_sparse_core_collective_offload_all_reduce
jesus-orozco Nov 8, 2024
8277980
dynamic global batch size based on pdbs and slices
jesus-orozco Nov 8, 2024
e23a4d0
enable xla_enable_async_all_reduce
jesus-orozco Nov 11, 2024
1797c52
Merge branch 'main' into feature/jax_pathways
jesus-orozco Nov 11, 2024
4868815
Refactor pathways config flag
jesus-orozco Nov 11, 2024
fe62afd
Merge branch 'apple:main' into feature/pathways_workload
jesus-orozco Nov 11, 2024
27ceea0
install libtpu nightly
jesus-orozco Nov 11, 2024
5c80fc8
Merge branch 'apple:main' into trillium_testing
jesus-orozco Nov 11, 2024
200ac48
custom remat policy for fuji-70b
jesus-orozco Nov 15, 2024
3bfda15
sparscore xla flags and nothing_saveable remat policy
jesus-orozco Nov 15, 2024
51e2e90
calculate batch size with jax devices
jesus-orozco Nov 18, 2024
b894304
update remat policy offloading
jesus-orozco Nov 18, 2024
ce5f1a2
Merge branch 'main' into feature/pathways_workload
jesus-orozco Nov 18, 2024
b51e67e
Update axlearn/cloud/gcp/job.py
jesus-orozco Nov 18, 2024
6537274
Update job.py with dynamic module imports
jesus-orozco Nov 18, 2024
7dbb1b9
Update job.py - remove pathways from dynamic import error message
jesus-orozco Nov 18, 2024
af7c746
Merge remote-tracking branch 'origin/feature/jax_pathways' into pathw…
jesus-orozco Nov 22, 2024
5704ce5
pathways jobset updates
jesus-orozco Nov 22, 2024
bf82554
merge trillium changes
jesus-orozco Nov 22, 2024
4f58291
Merge branch 'apple:main' into pathways_trillium
jesus-orozco Nov 25, 2024
d206110
Install pathwaysutils
jesus-orozco Nov 25, 2024
4152db3
Disable force eval
jesus-orozco Nov 25, 2024
0c33403
Launch pathways on trainer_main
jesus-orozco Nov 25, 2024
315fa6b
add v6e mesh rules
jesus-orozco Dec 5, 2024
765f64b
pin libtpu version
jesus-orozco Dec 5, 2024
f387807
update pathways jobset definition
jesus-orozco Dec 5, 2024
735ff37
Merge branch 'apple:main' into pathways_trillium
jesus-orozco Dec 6, 2024
2e0dd4b
dump xla flags to gcs
jesus-orozco Dec 9, 2024
e9d8341
Refactor jobset to align with new pathways structure
jesus-orozco Dec 12, 2024
357a322
Apply formatting
jesus-orozco Dec 12, 2024
89ac1ea
refactor pathways jobset to new spec
jesus-orozco Jan 10, 2025
d63dd6a
Rebase to axlearn main
jiya-zhang Jan 14, 2025
4393453
revert changes to gitignore and remove dockerignore
jesus-orozco Jan 14, 2025
9ba599b
revert changes to dependencies
jesus-orozco Jan 14, 2025
ca5c883
revert changes to trainer and model configs
jesus-orozco Jan 14, 2025
c7bc5df
remove unnecessary updates to gke tpu job for pathways workloads
jesus-orozco Jan 14, 2025
a0bf9df
Revert updates to fuji model config
jesus-orozco Jan 14, 2025
9f19159
Update pathways container specs for gke tpu job
jesus-orozco Jan 15, 2025
8f71508
Update gke tpu job to bypass jobset coordinator
jesus-orozco Jan 16, 2025
Files changed
5 changes: 5 additions & 0 deletions .dockerignore
@@ -0,0 +1,5 @@
logs
jobsets
.venv
.circleci
.vscode
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
# IGNORE
jobsets

# test results
test-results

4 changes: 3 additions & 1 deletion Dockerfile
@@ -93,8 +93,10 @@ RUN apt-get install -y google-perftools
ENV PIP_FIND_LINKS=https://storage.googleapis.com/jax-releases/libtpu_releases.html
# Ensure we install the TPU version, even if building locally.
# Jax will fallback to CPU when run on a machine without TPU.
RUN pip install .[core,tpu]
RUN pip install .[core,tpu,pathways]
RUN if [ -n "$EXTRAS" ]; then pip install .[$EXTRAS]; fi
RUN pip install -U --pre libtpu-nightly==0.1.dev20241203+nightly requests \
-f https://storage.googleapis.com/jax-releases/libtpu_releases.html
COPY . .

################################################################################
379 changes: 319 additions & 60 deletions axlearn/cloud/gcp/job.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion axlearn/cloud/gcp/tpu.py
@@ -718,7 +718,7 @@ def infer_tpu_workers(tpu_type: str) -> int:
tpu_version, tpu_cores = match.groups()
if tpu_version in {"v3", "v4", "v5p"}:
return int(tpu_cores) // 8
if tpu_version in {"v5litepod"}:
if tpu_version in {"v5litepod", "v6e"}:
Reviewer (Contributor): Can you rebase main? v6e is already supported

Author: Done

return int(tpu_cores) // 4
except Exception as e: # pylint: disable=broad-except
logging.error("Failed to parse tpu_type %s: %s", tpu_type, e)
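A quick sanity check of the new branch, written as a sketch (the import path follows the file shown above; the arithmetic assumes 4 chips per v6e host, which is what the `// 4` implies):

from axlearn.cloud.gcp.tpu import infer_tpu_workers

# v6e follows the v5litepod rule: worker (host) count = chip count / 4.
assert infer_tpu_workers("v6e-256") == 64
assert infer_tpu_workers("v5litepod-16") == 4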
2 changes: 1 addition & 1 deletion axlearn/cloud/gcp/utils.py
@@ -76,7 +76,7 @@ def running_from_vm() -> bool:
capture_output=True,
text=True,
)
return (out.returncode == 0) and "Metadata-Flavor: Google" in out.stdout
return False # (out.returncode == 0) and "Metadata-Flavor: Google" in out.stdout


def running_from_k8s() -> bool:
68 changes: 65 additions & 3 deletions axlearn/common/compiler_options.py
@@ -44,6 +44,68 @@ def default_xla_options(
xla_enable_async_all_gather="true", # Allow async all-gather.
xla_enable_async_collective_permute="true", # Allow async collective permute.
)
if version == "v6e":
options.update(
# improved performance for v6e
xla_tpu_scoped_vmem_limit_kib="98304",
# maxtext xla flags
# xla_enable_async_all_reduce="true",
# CF_FOR_ALL_GATHER
xla_tpu_enable_async_collective_fusion="true",
xla_tpu_enable_async_collective_fusion_fuse_all_gather="true",
xla_tpu_enable_async_collective_fusion_multiple_steps="true",
xla_tpu_overlap_compute_collective_tc="true",
xla_enable_async_all_gather="true",
# sparsecore offloading AR
xla_sc_disable_megacore_partitioning="true",
# xla_tpu_enable_async_collective_fusion_fuse_all_gather="false",
# xla_tpu_enable_all_gather_offload_tracing="true",
xla_tpu_use_tc_device_shape_on_sc="true",
# xla_tpu_enable_sparse_core_collective_offload_all_gather="true",
xla_sc_enable_instruction_fusion="false",
xla_sc_disjoint_spmem="false",
tpu_use_continuations="true",
xla_jf_crs_combiner_threshold_count="10",
xla_tpu_enable_sparse_core_collective_offload_all_reduce="true",
# Flag to enable some advanced scheduling features.
xla_tpu_enable_all_experimental_scheduler_features="true",
# Flag to enable memory tracking scheduling. The default AUTO only enables
# it in some situations. Not needed if
# xla_tpu_enable_all_experimental_scheduler_features is set to true already.
xla_tpu_enable_scheduler_memory_pressure_tracking="ENABLED",
# Flag controlling the maximum number of overlapping host offloadings.
xla_tpu_host_transfer_overlap_limit=24,
# Flag to enable the aggressive removal of opt-barriers.
xla_tpu_aggressive_opt_barrier_removal="ENABLED",
# Flag to enable more aggressive scheduling for async ops, such as pushing
# the async start to the beginning of the loop body.
xla_lhs_prioritize_async_depth_over_stall="ENABLED",
# For multi-slice configurations,
# Flag to enable pipelining of cross-DCN all-gathers.
xla_tpu_enable_ag_backward_pipelining="true",
xla_should_allow_loop_variant_parameter_in_chain="ENABLED",
xla_should_add_loop_invariant_op_in_chain="ENABLED",
# Flag controlling the maximum number of overlapping cross-DCN send/recv.
xla_max_concurrent_host_send_recv=100,
# If you are seeing OOM (out-of-memory) error, or bad performance when HBM memory
# usage is close to HBM capacity, tuning these two flags might help:
# Flag controlling the HBM memory limit as a percentage of the total HBM size.
# Default value is 95. Can tune up or down to give more or less memory for the
# scheduler. The scheduler favors more on less memory usage when it's under
# memory pressure, instead of hiding latency by overlapping more computations
# and communications.
# xla_tpu_scheduler_percent_shared_memory_limit=xx,
# Flag controlling the number of times the scheduler is run if the scheduled
# peak memory usage exceeds the initial memory limit, by setting memory limit
# to 90% of the previous memory limit each time. Default value is 1. Sometimes
# when the scheduler thinks it goes out memory, it may not actually happen due
# to other factors controlled by other compiler passes, or the initial memory
# limit is already set too low. Cutting the memory limit to 90% of previous one
# though, may make the scheduler weighting too much on the memory usage instead
# of latency side.
xla_latency_hiding_scheduler_rerun=0,
)
options["2a886c8_chip_config_name"] = "megachip_tccontrol"
if num_slices > 1:
# Support multiple TPU slices connected over a data center network.
options.update(
@@ -58,8 +120,8 @@
)

# Validate options. Will never fail if this function is implemented correctly.
for k, v in options.items():
assert v in [True, False, "true", "false"], (k, v)
# for k, v in options.items():
# assert v in [True, False, "true", "false"], (k, v)

return options

@@ -166,4 +228,4 @@ def infer_xsc_compiler_options(
return options


_TPU_VERSIONS = ("v3", "v4", "v5litepod", "v5p")
_TPU_VERSIONS = ("v3", "v4", "v5litepod", "v5p", "v6e")
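For context, a minimal sketch of how these options end up as libtpu flags. The call pattern mirrors the launch.py hunk below; the instance type and slice count are only examples:

import os

from axlearn.common import compiler_options

options = compiler_options.default_xla_options(
    instance_type="tpu-v6e-256", num_slices=2, backend="tpu"
)
# Serializes roughly to "--xla_tpu_scoped_vmem_limit_kib=98304
# --xla_tpu_enable_async_collective_fusion=true ..." and is picked up by libtpu at init.
os.environ["LIBTPU_INIT_ARGS"] = compiler_options.xla_flags_from_options(options)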
5 changes: 4 additions & 1 deletion axlearn/common/launch.py
@@ -23,6 +23,7 @@
instance_type=instance_type, num_slices=num_tpu_slices, backend="tpu"
)
os.environ["LIBTPU_INIT_ARGS"] = compiler_options.xla_flags_from_options(libtpu_init_options)
print("LIBTPU_INIT_ARGS: ", os.environ["LIBTPU_INIT_ARGS"], file=sys.stderr)
except compiler_options.NotTpuError as e:
# Log this when setup() is called.
tpu_flags_exc = e
@@ -132,7 +133,9 @@ def setup():
logging.info("Devices: %s", devices)
local_devices = jax.local_devices()
logging.info("Local Devices: %s", local_devices)
if not devices or not all(device.platform == FLAGS.jax_backend for device in devices):
if FLAGS.jax_backend != "proxy" and (
not devices or not all(device.platform == FLAGS.jax_backend for device in devices)
):
raise RuntimeError(f"Expected backend {FLAGS.jax_backend}. Got {devices}.")
if FLAGS.data_dir:
# TODO(ruoming): Get rid of --data_dir and use only env var DATA_DIR.
10 changes: 10 additions & 0 deletions axlearn/common/launch_trainer.py
@@ -68,6 +68,16 @@
None,
"The mesh selector string. See `SpmdTrainer.Config.mesh_rules` for details.",
)
flags.DEFINE_string(
"pdbs",
None,
"Per device batch size (Overrides global batch size).",
)
flags.DEFINE_integer(
"slices",
1,
"Number of slices for the TPU job.",
)

FLAGS = flags.FLAGS

2 changes: 2 additions & 0 deletions axlearn/common/launch_trainer_main.py
@@ -2,6 +2,8 @@

"""Main function for launching the trainer."""

# Temp hack to bypass invalid backend error
import pathwaysutils
from absl import app, flags

from axlearn.common import launch, launch_trainer, measurement
21 changes: 11 additions & 10 deletions axlearn/common/trainer.py
@@ -200,6 +200,9 @@ class Config(Module.Config):
# The provided config should instantiate to a thunk that returns the context manager.
context_manager: Optional[ConfigOr[Callable[[], ContextManager]]] = None

# The Global Batch Size
train_batch_size: Optional[int] = None

def __init__(
self,
cfg: Config,
@@ -569,13 +572,11 @@ def run(
self.vlog(3, "Start step %s", self.step)
output = self._run_step(
utils.host_to_global_device_array(input_batch),
force_run_evals=(
force_run_eval_sets_at_max_step if self.step >= cfg.max_step else None
),
force_run_evals=None,
)
self.vlog(3, "Done step %s", self.step)
num_steps += 1
if num_steps % 100 == 0:
if num_steps % 5 == 0:
now = time.perf_counter()
average_step_time = (now - start_time) / num_steps
self._step_log("Average step time: %s seconds", average_step_time)
@@ -1020,12 +1021,12 @@ def _run_step(
# Run the compiled function.
self._trainer_state, outputs = compiled_train_step_fn(self.trainer_state, input_batch)

if self.step % 100 == 0 or 0 <= self.step <= 5:
self._step_log(
"loss=%s aux=%s",
outputs["loss"],
jax.tree.map(lambda x: x.item() if x.ndim == 0 else f"T{x.shape}", outputs["aux"]),
)
# if self.step % 100 == 0 or 0 <= self.step <= 5:
self._step_log(
"loss=%s aux=%s",
outputs["loss"],
jax.tree.map(lambda x: x.item() if x.ndim == 0 else f"T{x.shape}", outputs["aux"]),
)

self.summary_writer(self.step, {"loss": outputs["loss"], **outputs["summaries"]})
# Aggregate summaries across evalers.
7 changes: 5 additions & 2 deletions axlearn/common/utils_spmd.py
@@ -52,7 +52,8 @@ def setup(
if initialization_timeout is not None:
init_kwargs["initialization_timeout"] = initialization_timeout

if jax_backend == "tpu":
# TPU resources orchestrated by Pathways use 'proxy' as the JAX backend
if jax_backend in ("tpu", "proxy"):
if not (
distributed_coordinator is None and num_processes is None and process_id is None
):
@@ -92,5 +93,7 @@ def setup(
# local_device_ids arg allows us to maintain expected behavior
init_kwargs["local_device_ids"] = list(range(8))

jax.distributed.initialize(**init_kwargs)
# When using Pathways proxy for TPU backend, jax distributed init is not needed
if jax_backend != "proxy":
jax.distributed.initialize(**init_kwargs)
_jax_distributed_initialized = True
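A small sketch of the two code paths after this change (the call shape follows the setup() signature shown above; whether Pathways needs any additional client-side initialization is outside this diff):

from axlearn.common import utils_spmd

# Plain TPU slice: behaves as before, including jax.distributed.initialize().
utils_spmd.setup(jax_backend="tpu")

# Pathways-managed TPUs: JAX talks to the proxy backend, so the distributed
# coordinator initialization is skipped entirely.
utils_spmd.setup(jax_backend="proxy")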
84 changes: 82 additions & 2 deletions axlearn/experiments/text/gpt/fuji.py
@@ -15,8 +15,10 @@
import itertools
from typing import Any, Optional, Union

from absl import flags
from jax.ad_checkpoint import checkpoint_policies as jax_remat_policies

from axlearn.cloud.gcp.system_characteristics import USER_FACING_NAME_TO_SYSTEM_CHARACTERISTICS
from axlearn.common import causal_lm, config
from axlearn.common.attention import (
BaseStackedTransformerLayer,
@@ -54,6 +56,8 @@
from axlearn.experiments.text.gpt.common import scaled_hidden_dim
from axlearn.experiments.trainer_config_utils import TrainerConfigFn

FLAGS = flags.FLAGS

MODEL_SIZES = ("test", "1B", "3B", "7B", "8B", "70B")


@@ -122,6 +126,10 @@ def get_trainer_kwargs(
max_step = TOTAL_TOKENS[version][model_size] // tokens_per_batch
max_sequence_length = MAX_SEQUENCE_LENGTH[version]
train_batch_size = tokens_per_batch // max_sequence_length
if FLAGS.pdbs:
import jax

train_batch_size = len(jax.devices()) * int(FLAGS.pdbs)

# Whether to use grouped query attention.
num_kv_heads = None
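A worked example of the per-device batch size override above, with illustrative numbers only (4 slices of v6e-256 visible to JAX):

num_devices = 4 * 256                  # what len(jax.devices()) would report across all slices
pdbs = 3                               # --pdbs flag value (a string flag, hence the int() cast)
train_batch_size = num_devices * pdbs  # 3072 sequences per step, replacing the
                                       # tokens_per_batch-derived value computed earlier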
@@ -287,6 +295,25 @@ def get_trainer_kwargs(
"gpu-(p5.48xlarge|p4de.24xlarge|a3-highgpu-8g)-(256|512|1024)",
mesh_shape_from_axes(data=-1, fsdp=8),
),
# tpu-v6e.
(
"tpu-v6e-.*",
ChainConfigModifier.default_config().set(
config_modifiers=[
MeshShapeModifier.default_config().set(
mesh_shape=mesh_shape_from_axes(data=-1, fsdp=256)
),
RematSpecModifier.default_config().set(
remat_policies={
"model.decoder.transformer.layer": RematSpec(
prevent_cse=True,
policy=jax_remat_policies.nothing_saveable,
),
}
),
],
),
),
),
)
elif model_size == "8B":
@@ -367,9 +394,40 @@ def get_trainer_kwargs(
"gpu-(p5.48xlarge|p4de.24xlarge|a3-highgpu-8g)-(256|512|1024)",
mesh_shape_from_axes(data=-1, fsdp=8),
),
# tpu-v6e.
(
"tpu-v6e-.*",
ChainConfigModifier.default_config().set(
config_modifiers=[
MeshShapeModifier.default_config().set(
mesh_shape=mesh_shape_from_axes(data=-1, fsdp=256)
),
RematSpecModifier.default_config().set(
remat_policies={
"model.decoder.transformer.layer": RematSpec(
prevent_cse=True,
policy=jax_remat_policies.nothing_saveable,
),
}
),
],
),
),
),
)
elif model_size == "70B":
remat_policy_70b = config_for_function(
jax_remat_policies.save_and_offload_only_these_names
).set(
names_which_can_be_saved=[],
names_which_can_be_offloaded=[
"FlashAttention.q_proj",
"FlashAttention.k_proj",
"FlashAttention.v_proj",
],
offload_src="device",
offload_dst="pinned_host",
)
trainer_kwargs = dict(
model_kwargs=dict(
num_layers=80,
@@ -387,6 +445,8 @@
max_sequence_length=max_sequence_length,
train_batch_size=train_batch_size,
max_step=max_step,
# eval_every_n_steps=500,
save_every_n_steps=100,
mesh_shape=mesh_shape_from_axes(fsdp=-1),
mesh_rules=(
# TPU V5e maximum per device batch is 1.
@@ -398,13 +458,13 @@
ChainConfigModifier.default_config().set(
config_modifiers=[
MeshShapeModifier.default_config().set(
mesh_shape=mesh_shape_from_axes(data=-1, fsdp=256)
mesh_shape=mesh_shape_from_axes(data=-1, fsdp=128)
),
RematSpecModifier.default_config().set(
remat_policies={
"model.decoder.transformer.layer": RematSpec(
prevent_cse=True,
policy=offload_dots_saveable_policy,
policy=jax_remat_policies.dots_saveable,
),
}
),
@@ -417,6 +477,26 @@
"gpu-(p5.48xlarge|p4de.24xlarge)-(512|1024)",
mesh_shape_from_axes(data=-1, fsdp=128),
),
# tpu-v6e.
(
"tpu-v6e-.*",
ChainConfigModifier.default_config().set(
config_modifiers=[
MeshShapeModifier.default_config().set(
mesh_shape=mesh_shape_from_axes(data=-1, fsdp=256)
),
RematSpecModifier.default_config().set(
remat_policies={
"model.decoder.transformer.layer": RematSpec(
prevent_cse=True,
policy=remat_policy_70b,
# policy=jax_remat_policies.nothing_saveable,
),
}
),
],
),
),
),
)
else: