
Bump transformers version to 4.48.1 #771

Merged: 11 commits, Jan 30, 2025
4 changes: 2 additions & 2 deletions README.md
@@ -26,13 +26,13 @@ To install the latest release of this package:
* For AWS Trainium (trn1) or AWS inferentia2 (inf2)

```bash
-pip install --upgrade-strategy eager optimum[neuronx]
+pip install --upgrade-strategy eager optimum-neuron[neuronx]
```

* For AWS inferentia (inf1)

```bash
-pip install --upgrade-strategy eager optimum[neuron]
+pip install --upgrade-strategy eager optimum-neuron[neuron]
```

Optimum Neuron is a fast-moving project, and you may want to install it from source:
4 changes: 2 additions & 2 deletions docs/source/guides/export_model.mdx
@@ -65,13 +65,13 @@ To export a 🤗 Transformers model to Neuron, you'll first need to install some
**For Inf2**

```bash
-pip install optimum[neuronx]
+pip install optimum-neuron[neuronx]
```

**For Inf1**

```bash
-pip install optimum[neuron]
+pip install optimum-neuron[neuron]
```

The Optimum Neuron export can be used through Optimum command-line:
4 changes: 2 additions & 2 deletions docs/source/inference_tutorials/stable_diffusion.mdx
@@ -25,7 +25,7 @@ limitations under the License.
To get started, make sure you have [configured your inf2 / trn1 instance](../installation), and installed optimum:

```bash
-pip install "optimum[neuronx, diffusers]"
+pip install optimum-neuron[neuronx] diffusers
```

### Compile Stable Diffusion
@@ -585,7 +585,7 @@ pipe.save_pretrained("sd_neuron_controlnet")

### Text-to-Image

For text-to-image, we can specify an additional conditioning input.

Here is an example with a canny image, a white outline of an image on a black background. The ControlNet will use the canny image as a control to guide the model to generate an image with the same outline.
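A minimal sketch of that flow, assuming the ControlNet pipeline was compiled and saved to `sd_neuron_controlnet` as above; the input URL and prompt are placeholders, and the class name follows the `optimum.neuron` ControlNet API:

```python
import cv2
import numpy as np
from diffusers.utils import load_image
from PIL import Image

from optimum.neuron import NeuronStableDiffusionControlNetPipeline

# Build the canny control image: white edges on a black background.
image = load_image("https://example.com/input.png")  # placeholder URL
edges = cv2.Canny(np.array(image), 100, 200)
canny_image = Image.fromarray(np.stack([edges] * 3, axis=-1))

# Load the pre-compiled pipeline and let the canny image guide generation.
pipe = NeuronStableDiffusionControlNetPipeline.from_pretrained("sd_neuron_controlnet")
generated = pipe("a futuristic-looking room", image=canny_image).images[0]
generated.save("generated.png")
```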

4 changes: 2 additions & 2 deletions docs/source/installation.mdx
@@ -29,11 +29,11 @@ python -m pip config set global.extra-index-url https://pip.repos.neuron.amazona
## Installing `optimum-neuron` for AWS Trainium (`trn1`) or AWS inferentia2 (`inf2`)

```bash
-python -m pip install --upgrade-strategy eager optimum[neuronx]
+python -m pip install --upgrade-strategy eager optimum-neuron[neuronx]
```

## Installing `optimum-neuron` for AWS inferentia (`inf1`)

```bash
-python -m pip install --upgrade-strategy eager optimum[neuron]
+python -m pip install --upgrade-strategy eager optimum-neuron[neuron]
```
2 changes: 1 addition & 1 deletion infrastructure/ami/hcl2-files/build.pkr.hcl
@@ -14,7 +14,7 @@ build {
]
}
provisioner "shell" {
-    inline = ["echo 'source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate' | sudo tee -a /home/ubuntu/.bashrc"]
+    inline = ["echo 'source /opt/aws_neuronx_venv_pytorch/bin/activate' | sudo tee -a /home/ubuntu/.bashrc"]
}
provisioner "file" {
source = "scripts/welcome-msg.sh"
2 changes: 1 addition & 1 deletion infrastructure/ami/hcl2-files/variables.pkr.hcl
@@ -10,7 +10,7 @@ variable "instance_type" {
}

variable "source_ami" {
-  default     = "ami-0980ce83654efe544"
+  default     = "ami-034a7ef9c22c72085"
description = "Base Image"
type = string
/*
6 changes: 3 additions & 3 deletions infrastructure/ami/scripts/install-huggingface-libraries.sh
@@ -1,7 +1,7 @@
#!/bin/bash

# Activate the neuron virtual environment
-source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate
+source /opt/aws_neuronx_venv_pytorch/bin/activate

echo "Step: install-hugging-face-libraries"

@@ -15,12 +15,12 @@ pip install --upgrade --no-cache-dir \
"markupsafe==2.1.1" \
"jinja2==3.1.2" \
"attrs==23.1.0" \
"hf_transfer>=0.1.4"

# Temporary fix for the issue: https://github.com/huggingface/optimum-neuron/issues/142
pip install -U optimum
echo 'export PATH="${HOME}/.local/bin:$PATH"' >> "${HOME}/.bashrc"
# Add HF_TRANSFER by default
echo 'export HF_HUB_ENABLE_HF_TRANSFER=1' >> "${HOME}/.bashrc"

echo "Step: install-and-copy-optimum-neuron-examples"
2 changes: 1 addition & 1 deletion infrastructure/ami/scripts/validate-neuron.sh
@@ -3,7 +3,7 @@ echo "Step: validate-neuron-devices"
neuron-ls

# Activate the neuron virtual environment
-source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate
+source /opt/aws_neuronx_venv_pytorch/bin/activate

python -c 'import torch'
python -c 'import torch_neuronx'
2 changes: 1 addition & 1 deletion notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb
@@ -25,7 +25,7 @@
"metadata": {},
"outputs": [],
"source": [
-    "!pip install \"optimum[neuronx, diffusers]\" matplotlib"
+    "!pip install \"optimum-neuron[neuronx]\" diffusers matplotlib"
]
},
{
Changes to a second notebook (file name not shown in this view):
@@ -25,7 +25,7 @@
"metadata": {},
"outputs": [],
"source": [
-    "!pip install \"optimum[neuronx, diffusers]\" matplotlib"
+    "!pip install \"optimum-neuron[neuronx]\" diffusers matplotlib"
]
},
{
2 changes: 1 addition & 1 deletion optimum/exporters/neuron/base.py
@@ -458,7 +458,7 @@ class NeuronDecoderConfig(NeuronConfig):
    def __init__(self, task: str):
        if not is_transformers_neuronx_available():
            raise ModuleNotFoundError(
-                "The mandatory transformers-neuronx package is missing. Please install optimum[neuronx]."
+                "The mandatory transformers-neuronx package is missing. Please install optimum-neuron[neuronx]."
            )
        if isinstance(self.NEURONX_CLASS, type):
            self._neuronx_class = self.NEURONX_CLASS
29 changes: 20 additions & 9 deletions optimum/neuron/distributed/checkpointing.py
@@ -20,7 +20,7 @@
from typing import Any, Callable, Dict, List, Literal, Union

import torch
-from transformers.modeling_utils import shard_checkpoint
+from huggingface_hub import split_torch_state_dict_into_shards
from transformers.utils import (
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
@@ -255,16 +255,27 @@ def consolidate_model_parallel_checkpoints_to_unified_checkpoint(
    output_dir.mkdir(parents=True, exist_ok=True)

    state_dict = consolidate_model_parallel_checkpoints(checkpoint_dir)
-    shards, index = shard_checkpoint(
-        state_dict, weights_name=safe_weights_name if save_format == "safetensors" else weights_name
+    state_dict_split = split_torch_state_dict_into_shards(
+        state_dict, filename_pattern=safe_weights_name if save_format == "safetensors" else weights_name
    )
-    for shard_file, shard in shards.items():
-        if save_format == "safetensors":
-            save_file(shard, output_dir / shard_file, metadata={"format": "pt"})
-        else:
-            torch.save(shard, output_dir / shard_file)
-    if index is not None:
+    # Save index if sharded
+    if state_dict_split.is_sharded:
+        index = {
+            "metadata": state_dict_split.metadata,
+            "weight_map": state_dict_split.tensor_to_filename,
+        }
        save_index_file = SAFE_WEIGHTS_INDEX_NAME if save_format == "safetensors" else WEIGHTS_INDEX_NAME
        with open(output_dir / save_index_file, "w") as fp:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            fp.write(content)
+    # Save the model
+    filename_to_tensors = state_dict_split.filename_to_tensors.items()
+    for shard_file, tensors in filename_to_tensors:
+        shard = {}
+        for tensor in tensors:
+            shard[tensor] = state_dict[tensor].contiguous()
+            del state_dict[tensor]
+        if save_format == "safetensors":
+            save_file(shard, output_dir / shard_file, metadata={"format": "pt"})
+        else:
+            torch.save(shard, output_dir / shard_file)
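For context on this hunk: `shard_checkpoint` has been removed from recent transformers releases, and its replacement, `split_torch_state_dict_into_shards` from `huggingface_hub` (hence the `huggingface_hub >= 0.28.0` bump in setup.py below), only plans the split; the caller materializes and saves each shard. A minimal sketch of the new API, with illustrative tensor names and shard size:

```python
import torch
from huggingface_hub import split_torch_state_dict_into_shards
from safetensors.torch import save_file

state_dict = {
    "embed.weight": torch.zeros(1000, 64),
    "lm_head.weight": torch.zeros(1000, 64),
}

# Plan the split: nothing is written yet, the result only maps shard
# filenames to the names of the tensors each shard should contain.
split = split_torch_state_dict_into_shards(
    state_dict,
    filename_pattern="model{suffix}.safetensors",
    max_shard_size="5GB",
)
for filename, tensor_names in split.filename_to_tensors.items():
    shard = {name: state_dict[name].contiguous() for name in tensor_names}
    save_file(shard, filename, metadata={"format": "pt"})
if split.is_sharded:
    # The index maps every tensor name to the shard file that holds it.
    index = {"metadata": split.metadata, "weight_map": split.tensor_to_filename}
```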
44 changes: 17 additions & 27 deletions optimum/neuron/distributed/decoder_models.py
@@ -441,12 +441,11 @@ def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_paral
        def attention_forward(
            self,
            hidden_states: torch.Tensor,
+            position_embeddings: Tuple[torch.Tensor, torch.Tensor],
            attention_mask: Optional[torch.Tensor] = None,
-            position_ids: Optional[torch.LongTensor] = None,
            past_key_value: Optional[Cache] = None,
-            output_attentions: bool = False,
-            use_cache: bool = False,
            cache_position: Optional[torch.LongTensor] = None,
+            output_attentions: Optional[bool] = False,
            **kwargs,
        ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
            if self.config.pretraining_tp > 1:
@@ -489,8 +488,8 @@ def attention_forward(
            value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

            past_key_value = getattr(self, "past_key_value", past_key_value)
-            cos, sin = self.rotary_emb(value_states, position_ids)
-            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+            cos, sin = position_embeddings
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

            if past_key_value is not None:
                # sin and cos are specific to RoPE models; cache_position needed for the static cache
@@ -539,7 +538,7 @@ def attention_forward(
            if not output_attentions:
                attn_weights = None

-            return attn_output, attn_weights, past_key_value
+            return attn_output, attn_weights

        for module in model.modules():
            if isinstance(module, LlamaAttention):
@@ -588,6 +587,10 @@ def _parallelize(
        layers = model.model.layers

        for layer in layers:
+            # FIXME: temporary workaround to avoid too many changes in the transformation code
+            layer.self_attn.num_heads = layer.self_attn.config.num_attention_heads
+            layer.self_attn.num_key_value_heads = layer.self_attn.config.num_key_value_heads
+            layer.self_attn.hidden_size = layer.self_attn.config.hidden_size
            layer.self_attn = LlamaParallelSelfAttention.transform(
                model,
                layer.self_attn,

Review comment on lines +590 to +593 (Member): "Thanks!"
@@ -717,12 +720,10 @@ def patch_for_sequence_parallelism(cls, model: "PreTrainedModel", sequence_paral
        def attention_forward(
            self,
            hidden_states: torch.Tensor,
+            position_embeddings: Tuple[torch.Tensor, torch.Tensor],
            attention_mask: Optional[torch.Tensor] = None,
-            position_ids: Optional[torch.LongTensor] = None,
            past_key_value: Optional[Cache] = None,
            output_attentions: bool = False,
-            use_cache: bool = False,
-            cache_position: Optional[torch.LongTensor] = None,
        ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
            query_states = self.q_proj(hidden_states)
            key_states = self.k_proj(hidden_states)
@@ -745,12 +746,8 @@ def attention_forward(
            key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
            value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

-            kv_seq_len = key_states.shape[-2]
-            if past_key_value is not None:
-                kv_seq_len += cache_position[0]
-
-            cos, sin = self.rotary_emb(value_states, position_ids)
-            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+            cos, sin = position_embeddings
+            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

            if past_key_value is not None:
                cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
@@ -764,18 +761,7 @@

            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

-            if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                    f" {attn_weights.size()}"
-                )
-
            if attention_mask is not None:
-                if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                    raise ValueError(
-                        f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                    )
-
                attn_weights = attn_weights + attention_mask

            # upcast attention to fp32
@@ -801,7 +787,7 @@
            if not output_attentions:
                attn_weights = None

-            return attn_output, attn_weights, past_key_value
+            return attn_output, attn_weights

        for module in model.modules():
            if isinstance(module, MistralAttention):
@@ -831,6 +817,10 @@ def _parallelize(
                **parallel_layer_specific_kwargs,
            )
        for layer in model.model.layers:
+            # FIXME: temporary workaround to avoid too many changes in the transformation code
+            layer.self_attn.num_heads = layer.self_attn.config.num_attention_heads
+            layer.self_attn.num_key_value_heads = layer.self_attn.config.num_key_value_heads
+            layer.self_attn.hidden_size = layer.self_attn.config.hidden_size
            layer.self_attn = MistralParallelSelfAttention.transform(
                model,
                layer.self_attn,
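For background on these changes: in transformers 4.48 the base model computes the rotary `(cos, sin)` tuple once per forward pass and passes it to every decoder layer, attention modules read `num_heads`, `num_key_value_heads`, and `hidden_size` from `self.config` rather than from instance attributes (hence the FIXME workarounds above), and attention forwards return a 2-tuple without `past_key_value`. A simplified sketch of the calling convention, paraphrased rather than copied from the transformers source:

```python
import torch

def model_forward_sketch(model, hidden_states, position_ids, attention_mask=None):
    # The base model builds the RoPE (cos, sin) tuple exactly once...
    position_embeddings = model.rotary_emb(hidden_states, position_ids)
    for layer in model.layers:
        # ...and threads it through each decoder layer, so the patched
        # attention_forward receives it instead of calling self.rotary_emb.
        hidden_states = layer(
            hidden_states,
            attention_mask=attention_mask,
            position_embeddings=position_embeddings,
        )[0]
    return hidden_states
```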
2 changes: 1 addition & 1 deletion optimum/neuron/version.py
@@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-__version__ = "0.0.28.dev0"
+__version__ = "0.0.28.dev1"

__sdk_version__ = "2.20.2"
7 changes: 4 additions & 3 deletions setup.py
@@ -13,10 +13,10 @@


INSTALL_REQUIRES = [
-    "transformers == 4.46.2",
+    "transformers ~= 4.48.1",
    "accelerate == 0.29.2",
-    "optimum ~= 1.23.0",
-    "huggingface_hub >= 0.20.1",
+    "optimum ~= 1.23.3",
+    "huggingface_hub >= 0.28.0",
    "numpy>=1.22.2, <=1.25.2",
    "protobuf>=3.20.3, <4",
]
@@ -41,6 +41,7 @@
    "opencv-python-headless",
    "controlnet-aux",
    "mediapipe",
+    "timm >= 1.0.0",
]

QUALITY_REQUIRES = [
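A note on the loosened transformers pin: `~= 4.48.1` is PEP 440's compatible-release operator, equivalent to `>= 4.48.1, == 4.48.*`, so future 4.48.x patch releases are accepted while 4.49 is not. A quick check with the `packaging` library, with versions chosen for illustration:

```python
from packaging.specifiers import SpecifierSet

spec = SpecifierSet("~=4.48.1")  # same as ">=4.48.1, ==4.48.*"
print(spec.contains("4.48.3"))  # True: later patch releases satisfy the pin
print(spec.contains("4.49.0"))  # False: the next minor release does not
```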