diff --git a/.github/workflows/inference_cache.yml b/.github/workflows/inference_cache_llm.yml similarity index 84% rename from .github/workflows/inference_cache.yml rename to .github/workflows/inference_cache_llm.yml index 49dd1565b..bc4106b84 100644 --- a/.github/workflows/inference_cache.yml +++ b/.github/workflows/inference_cache_llm.yml @@ -1,4 +1,4 @@ -name: Optimum neuron inference cache builder +name: Optimum neuron LLM inference cache builder on: workflow_dispatch: @@ -12,7 +12,8 @@ concurrency: jobs: cache: name: Create optimum-neuron inference cache - runs-on: [self-hosted, 12-aws-inf2, 192-cpu, ci] # run the job on the newly created runner + runs-on: + group: aws-inf2-48xlarge env: AWS_REGION: us-east-1 strategy: @@ -38,13 +39,13 @@ jobs: EOF wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add - sudo apt-get update -y - sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e -y + sudo apt-get install aws-neuronx-tools=2.18.3.0 aws-neuronx-runtime-lib=2.21.41.0-fb1705f5f aws-neuronx-collectives=2.21.46.0-69b77134b -y export PATH=/opt/aws/neuron/bin:$PATH - name: Checkout uses: actions/checkout@v4 - name: Install python and create venv run: | - sudo apt install python3.8-venv python3-dev -y + sudo apt install python3-venv python3-dev -y python3 -m venv aws_neuron_venv_pytorch source aws_neuron_venv_pytorch/bin/activate python -m pip install -U pip diff --git a/.github/workflows/inference_cache_stable_diffusion.yml b/.github/workflows/inference_cache_stable_diffusion.yml new file mode 100644 index 000000000..857aca840 --- /dev/null +++ b/.github/workflows/inference_cache_stable_diffusion.yml @@ -0,0 +1,52 @@ +name: Optimum neuron SD inference cache builder + +on: + workflow_dispatch: + schedule: + # Schedule the workflow to run every Saturday at midnight UTC + - cron: '0 0 * * 6' + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + +jobs: + cache: + name: Create optimum-neuron inference cache + runs-on: + group: aws-inf2-8xlarge + env: + AWS_REGION: us-east-1 + strategy: + fail-fast: false + matrix: + config: [stable-diffusion] + steps: + - name: Install Neuron runtime + run: | + . 
/etc/os-release + sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <> $GITHUB_PATH - name: Install python dependencies run: | - sudo apt install python3.8-venv python3-dev -y + sudo apt install python3-venv python3-dev -y python3 -m venv aws_neuron_venv_pytorch source aws_neuron_venv_pytorch/bin/activate python -m pip install -U pip diff --git a/.github/workflows/test_trainium_examples.yml b/.github/workflows/test_trainium_examples.yml index bb96965ed..52397a2ac 100644 --- a/.github/workflows/test_trainium_examples.yml +++ b/.github/workflows/test_trainium_examples.yml @@ -28,9 +28,9 @@ concurrency: jobs: optimum-neuron-tests: name: Run example script tests on Trainium 1 - runs-on: [self-hosted, 1-aws-trn, 8-cpu, ci] # run the job on the newly created runner + runs-on: + group: aws-trn1-2xlarge env: - AWS_REGION: us-east-1 RUN_TINY: ${{ github.event.inputs.model_size == "tiny" && "1" || "0" }} steps: - name: Install Neuron runtime @@ -49,7 +49,7 @@ jobs: run: echo "/home/ubuntu/.local/bin" >> $GITHUB_PATH - name: Install python dependencies run: | - sudo apt install python3.8-venv python3-dev -y + sudo apt install python3-venv python3-dev -y python3 -m venv aws_neuron_venv_pytorch source aws_neuron_venv_pytorch/bin/activate python -m pip install -U pip diff --git a/Makefile b/Makefile index 3fe53135d..1ae0aa514 100644 --- a/Makefile +++ b/Makefile @@ -40,7 +40,7 @@ PACKAGE_FILES = $(PACKAGE_PYTHON_FILES) \ $(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES) python -m build -TGI_VERSION ?= 2.0.2 +TGI_VERSION ?= 2.1.1 neuronx-tgi: $(PACKAGE_DIST) docker build --rm -f text-generation-inference/Dockerfile \ diff --git a/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results-batchsize-1.csv b/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results-batchsize-1.csv index a73b2409b..be95fdf57 100644 --- a/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results-batchsize-1.csv +++ b/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results-batchsize-1.csv @@ -1,11 +1,8 @@ model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms) -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,38.29638310438374,0.5521726660008426,24.784959740501066 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,2,38.98036959617541,2.72243953349971,32.827924415254174 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,4,39.39299322930307,8.926065296996967,63.795771842799695 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,39.85480734427003,22.479033984491252,110.33245410384168 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,39.797703130119444,48.74777327400079,218.4971534548553 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,32,39.88112179496438,98.32968477499526,419.0164926030421 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,64,40.021570341867225,201.50347035600862,787.0418267487788 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,128,40.15190355766733,412.9219288924942,1608.1377339868322 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,256,40.10404829156176,831.7238280020028,3167.7755826448656 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,512,39.94606130182408,1654.066714687011,6348.469898092637 \ No newline at end of file +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,47.036052229904286,0.5264043899951503,20.465843433893646 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,2,47.80577679519902,2.3793273210758343,28.003227711219164 
+huggingface/NousResearch/Meta-Llama-3-70B-Instruct,4,48.461899623182426,7.175522217527032,52.262069554218435 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,48.773082055567116,18.760041670990176,84.57645582900525 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,49.25268191199813,38.44228728755843,170.77333503909537 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,32,49.416289879279326,80.52483583055437,339.16646354168245 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,64,49.64596189738265,162.74696793837938,653.7145961071376 \ No newline at end of file diff --git a/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results-batchsize-8.csv b/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results-batchsize-8.csv index a6a749456..3f1359b72 100644 --- a/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results-batchsize-8.csv +++ b/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results-batchsize-8.csv @@ -1,11 +1,8 @@ model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms) -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,17.8322790536497,0.9939256490033586,54.45429111182844 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,2,31.140113024869468,1.418605798491626,58.17940704286386 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,4,52.71447508703364,3.691673280511168,65.510341492747 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,85.23757246875635,7.40343523149204,79.86574747355823 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,83.41704442714865,12.134337133495137,119.80365178993138 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,32,86.31413401709217,33.19637775150477,221.51387761253872 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,64,91.54051788296289,78.17263232148252,378.5575452672668 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,128,93.59227409861985,163.85781266850245,709.4836254794548 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,256,94.49695504491365,332.89309809000406,1342.054465909721 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,512,94.76202310893393,671.8385370509932,2633.1926459323054 \ No newline at end of file +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,30.427246781641298,0.8465556244991603,31.940039622234323 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,2,51.814449828727014,1.6578905005007982,35.303453482741524 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,4,80.2709630081468,3.330909075506497,42.56936121417554 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,123.95265621601503,6.631509564504086,55.705136697349 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,123.41551351427066,9.613880677999987,86.92725453610271 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,32,129.01888179479806,24.736346793506527,151.7353908036725 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,64,133.1848930344421,56.04210297649843,269.5292990644778 \ No newline at end of file diff --git a/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results.csv b/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results.csv index c29c7211e..e764ef36c 100644 --- a/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results.csv +++ b/benchmark/text-generation-inference/llama3-70b-trn1.32xlarge/tgi-results.csv @@ -1,11 +1,9 @@ model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms) 
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,27.321283482983713,0.9897541589998582,34.53017190612728 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,2,47.14780790833105,1.4317841799993403,38.47682874008382 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,4,75.46880157534952,3.7293467640001836,45.219761063884626 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,76.656177664245,6.710071522500584,67.5562098563004 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,78.10745154737947,18.174910198499674,130.32796764867985 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,32,80.94695720514072,42.99618862100033,211.52529640942643 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,64,83.41961944293132,90.68870028399942,387.7336944140728 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,128,84.68410927601217,187.20342993849863,761.1909438667759 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,256,85.08930039980858,376.98190486400017,1484.3806421055476 -huggingface/NousResearch/Meta-Llama-3-70B-Instruct,512,84.99711473871804,758.8232675055006,2947.3092666464 \ No newline at end of file +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,33.13623382956197,0.847853995503101,29.194137679169646 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,2,55.61050569710368,1.2736788609981886,32.90289103801151 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,4,86.58456489701315,3.34435009349545,39.293303536708166 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,90.20366126074002,5.904863483490772,55.9280860250687 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,93.46094649258835,16.161018327497004,108.1654402765888 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,32,94.60090167352047,37.34856389850029,182.42863576574968 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,64,97.6200946608771,80.71991845800221,357.9967863853603 +huggingface/NousResearch/Meta-Llama-3-70B-Instruct,128,98.4129882974856,165.12605451900163,684.5967867979082 \ No newline at end of file diff --git a/docs/source/containers.mdx b/docs/source/containers.mdx index c05c59b3d..a03c31861 100644 --- a/docs/source/containers.mdx +++ b/docs/source/containers.mdx @@ -15,15 +15,26 @@ specific language governing permissions and limitations under the License. We provide pre-built Optimum Neuron containers for Amazon SageMaker. These containers come with all of the Hugging Face libraries and dependencies pre-installed, so you can start using them right away. We have containers for training and inference, and optimized text generation containers with TGI. The table is up to date and only includes the latest versions of each container. You can find older versions in the [Deep Learning Container Release Notes](https://github.com/aws/deep-learning-containers/releases?q=hf-neuronx&expanded=true) -We recommend using the `sagemaker` Python SDK to retrieve the image URI for the container you want to use. +We recommend using the `sagemaker` Python SDK to retrieve the image URI for the container you want to use. 
Here is a code snippet to retrieve the latest Text Generation Inference container Image URI: +```python +from sagemaker.huggingface import get_huggingface_llm_image_uri + +# retrieve the llm image uri +llm_image = get_huggingface_llm_image_uri( + "huggingface-neuronx" +) + +print(f"llm image uri: {llm_image}") + +``` ## Available Optimum Neuron Containers | Type | Optimum Version | Image URI | |-----------------------------|-----------------|---------------------------------------------| -| Training | 0.0.21 | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training-neuronx:1.13.1-transformers4.36.2-neuronx-py310-sdk2.18.0-ubuntu20.04` | -| Inference | 0.0.22 | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference-neuronx:2.1.2-transformers4.36.2-neuronx-py310-sdk2.18.0-ubuntu20.04` | -| Text Generation Inference | 0.0.22 | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.2-optimum0.0.22-neuronx-py310-ubuntu22.04` | +| Training | 0.0.24 | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training-neuronx:2.1.2-transformers4.41.1-neuronx-py310-sdk2.19.1-ubuntu20.04` | +| Inference | 0.0.24 | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-inference-neuronx:2.1.2-transformers4.41.1-neuronx-py310-sdk2.19.1-ubuntu20.04` | +| Text Generation Inference | 0.0.24 | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.1.2-optimum0.0.24-neuronx-py310-ubuntu22.04` | Please replace `763104351884` with the correct [AWS account ID](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/image_uri_config/huggingface-neuronx.json) and `region` with the AWS region you are working in. diff --git a/docs/source/inference_tutorials/stable_diffusion.mdx b/docs/source/inference_tutorials/stable_diffusion.mdx index 408924cd1..df9530875 100644 --- a/docs/source/inference_tutorials/stable_diffusion.mdx +++ b/docs/source/inference_tutorials/stable_diffusion.mdx @@ -635,4 +635,55 @@ compare.save("compare.png") /> + +## ControlNet with Stable Diffusion XL + +### Compile + +```bash +optimum-cli export neuron -m stabilityai/stable-diffusion-xl-base-1.0 --task stable-diffusion-xl --batch_size 1 --height 1024 --width 1024 --controlnet_ids diffusers/controlnet-canny-sdxl-1.0-small --num_images_per_prompt 1 sdxl_neuron_controlnet/ +``` + +### Text-to-Image + +```python +import cv2 +import numpy as np +from diffusers.utils import load_image +from PIL import Image +from optimum.neuron import NeuronStableDiffusionXLControlNetPipeline + +# Inputs +prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting" +negative_prompt = "low quality, bad quality, sketches" + +image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png" +) +image = np.array(image) +image = cv2.Canny(image, 100, 200) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +image = Image.fromarray(image) + +controlnet_conditioning_scale = 0.5 # recommended for good generalization + +pipe = NeuronStableDiffusionXLControlNetPipeline.from_pretrained("sdxl_neuron_controlnet") + +images = pipe( + prompt, + negative_prompt=negative_prompt, + image=image, + controlnet_conditioning_scale=controlnet_conditioning_scale, +).images +images[0].save("hug_lab.png") +``` + +stable diffusion xl generated image with controlnet. 
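The compiled artifacts can be reused for further generations without recompiling. Below is a minimal follow-up sketch, assuming the `sdxl_neuron_controlnet/` directory produced above is available locally; it rebuilds a Canny conditioning image and sweeps `controlnet_conditioning_scale` (the scale values and output file names are illustrative, not part of this change).

```python
import cv2
import numpy as np
from diffusers.utils import load_image
from PIL import Image

from optimum.neuron import NeuronStableDiffusionXLControlNetPipeline

# Reload the pre-compiled pipeline; static shapes were fixed at export time, so no recompilation happens here.
pipe = NeuronStableDiffusionXLControlNetPipeline.from_pretrained("sdxl_neuron_controlnet")

# Build a Canny edge map from any input picture.
image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
)
canny = cv2.Canny(np.array(image), 100, 200)
canny = Image.fromarray(np.stack([canny] * 3, axis=-1))

prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
negative_prompt = "low quality, bad quality, sketches"

# Lower scales let the prompt dominate; higher scales keep the output closer to the edge map.
for scale in (0.3, 0.5, 0.8):
    generated = pipe(
        prompt,
        negative_prompt=negative_prompt,
        image=canny,
        controlnet_conditioning_scale=scale,
    ).images[0]
    generated.save(f"hug_lab_scale_{scale}.png")
```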
+ Are there any other stable diffusion features that you want us to support in 🤗`Optimum-neuron`? Please file an issue to [`Optimum-neuron` Github repo](https://github.com/huggingface/optimum-neuron) or discuss with us on [HuggingFace’s community forum](https://discuss.huggingface.co/c/optimum/), cheers 🤗 ! diff --git a/docs/source/package_reference/modeling.mdx b/docs/source/package_reference/modeling.mdx index f0258b772..121275fb2 100644 --- a/docs/source/package_reference/modeling.mdx +++ b/docs/source/package_reference/modeling.mdx @@ -139,3 +139,8 @@ The following Neuron model classes are available for stable diffusion tasks. ### NeuronStableDiffusionXLInpaintPipeline [[autodoc]] modeling_diffusion.NeuronStableDiffusionXLInpaintPipeline - __call__ + +### NeuronStableDiffusionXLControlNetPipeline + +[[autodoc]] modeling_diffusion.NeuronStableDiffusionXLControlNetPipeline + - __call__ diff --git a/docs/source/training_tutorials/finetune_llm.mdx b/docs/source/training_tutorials/finetune_llm.mdx index f64a144df..4928ca37c 100644 --- a/docs/source/training_tutorials/finetune_llm.mdx +++ b/docs/source/training_tutorials/finetune_llm.mdx @@ -16,7 +16,7 @@ limitations under the License. # Fine-tune and Test Llama-3 8B on AWS Trainium -_Note: The complete script for this tutorial can be downloaded [here](https://github.com/huggingface/optimum-neuron/docs/source/training_tutorials/finetune_llm.py)._ +_Note: The complete script for this tutorial can be downloaded [here](https://github.com/huggingface/optimum-neuron/blob/main/docs/source/training_tutorials/finetune_llm.py)._ This tutorial will teach you how to fine-tune open source LLMs like [Llama 3](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on AWS Trainium. In our example, we are going to leverage the [Optimum Neuron](https://huggingface.co/docs/optimum-neuron/index), [Transformers](https://huggingface.co/docs/transformers/index) and [Datasets](https://huggingface.co/docs/datasets/index) libraries. diff --git a/infrastructure/ami/hcl2-files/build.pkr.hcl b/infrastructure/ami/hcl2-files/build.pkr.hcl index f9327dacf..e412ca2e8 100644 --- a/infrastructure/ami/hcl2-files/build.pkr.hcl +++ b/infrastructure/ami/hcl2-files/build.pkr.hcl @@ -14,7 +14,7 @@ build { ] } provisioner "shell" { - inline = ["echo 'source /opt/aws_neuron_venv_pytorch/bin/activate' >> /home/ubuntu/.bashrc"] + inline = ["echo 'source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate' | sudo tee -a /home/ubuntu/.bashrc"] } provisioner "file" { source = "scripts/welcome-msg.sh" @@ -26,4 +26,4 @@ build { "sudo chmod +x /etc/update-motd.d/99-custom-message", ] } -} \ No newline at end of file +} diff --git a/infrastructure/ami/hcl2-files/variables.pkr.hcl b/infrastructure/ami/hcl2-files/variables.pkr.hcl index fe23fb9ca..d20b3ff77 100644 --- a/infrastructure/ami/hcl2-files/variables.pkr.hcl +++ b/infrastructure/ami/hcl2-files/variables.pkr.hcl @@ -10,7 +10,7 @@ variable "instance_type" { } variable "source_ami" { - default = "ami-0274e546d67626305" + default = "ami-0bcb701dd3cace633" description = "Base Image" type = string /* @@ -18,7 +18,7 @@ variable "source_ami" { aws ec2 describe-images \ --region us-east-1 \ --owners amazon \ - --filters 'Name=name,Values=Deep Learning AMI Neuron PyTorch 1.13 (Ubuntu 20.04) ????????' 'Name=state,Values=available' \ + --filters 'Name=name,Values=Deep Learning AMI Neuron ????????' 
'Name=state,Values=available' \ --query 'reverse(sort_by(Images, &CreationDate))[:1].ImageId' \ --output text */ diff --git a/infrastructure/ami/scripts/install-huggingface-libraries.sh b/infrastructure/ami/scripts/install-huggingface-libraries.sh index c9825ddec..406691ff2 100644 --- a/infrastructure/ami/scripts/install-huggingface-libraries.sh +++ b/infrastructure/ami/scripts/install-huggingface-libraries.sh @@ -1,7 +1,7 @@ #!/bin/bash # Activate the neuron virtual environment -source /opt/aws_neuron_venv_pytorch/bin/activate +source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate echo "Step: install-hugging-face-libraries" @@ -34,4 +34,4 @@ rm -rf optimum-neuron chmod -R 777 /home/ubuntu/huggingface-neuron-samples /home/ubuntu/huggingface-neuron-notebooks echo "Step: validate-imports-of-huggingface-libraries" -bash -c 'python -c "import transformers;import datasets;import accelerate;import evaluate;import tensorboard; import torch;from optimum.neuron import pipeline"' \ No newline at end of file +bash -c 'python -c "import transformers;import datasets;import accelerate;import evaluate;import tensorboard; import torch;from optimum.neuron import pipeline"' diff --git a/infrastructure/ami/scripts/validate-neuron.sh b/infrastructure/ami/scripts/validate-neuron.sh index c2fdcb7de..5d8c99109 100644 --- a/infrastructure/ami/scripts/validate-neuron.sh +++ b/infrastructure/ami/scripts/validate-neuron.sh @@ -3,11 +3,11 @@ echo "Step: validate-neuron-devices" neuron-ls # Activate the neuron virtual environment -source /opt/aws_neuron_venv_pytorch/bin/activate +source /opt/aws_neuronx_venv_pytorch_2_1/bin/activate python -c 'import torch' python -c 'import torch_neuronx' echo "Installing Tensorboard Plugin for Neuron" pip install --upgrade --no-cache-dir \ - "tensorboard-plugin-neuronx" \ No newline at end of file + "tensorboard-plugin-neuronx" diff --git a/notebooks/sentence-transformers/getting-started.ipynb b/notebooks/sentence-transformers/getting-started.ipynb index b72071dea..148022fe8 100644 --- a/notebooks/sentence-transformers/getting-started.ipynb +++ b/notebooks/sentence-transformers/getting-started.ipynb @@ -46,6 +46,7 @@ "source": [ "from optimum.neuron import NeuronModelForSentenceTransformers\n", "\n", + "\n", "# Sentence Transformers model from HuggingFace\n", "model_id = \"BAAI/bge-small-en-v1.5\"\n", "input_shapes = {\"batch_size\": 1, \"sequence_length\": 384} # mandatory shapes\n", @@ -88,9 +89,11 @@ "metadata": {}, "outputs": [], "source": [ - "from optimum.neuron import NeuronModelForSentenceTransformers\n", "from transformers import AutoTokenizer\n", "\n", + "from optimum.neuron import NeuronModelForSentenceTransformers\n", + "\n", + "\n", "model_id_or_path = \"bge_emb_inf2/\"\n", "tokenizer_id = \"BAAI/bge-small-en-v1.5\"\n", "\n", diff --git a/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb b/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb index dc26906f8..775fd254c 100644 --- a/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb +++ b/notebooks/stable-diffusion/stable-diffusion-txt2img.ipynb @@ -55,6 +55,7 @@ "source": [ "from optimum.neuron import NeuronStableDiffusionPipeline\n", "\n", + "\n", "model_id = \"stabilityai/stable-diffusion-2-1\"\n", "num_image_per_prompt = 1\n", "input_shapes = {\"batch_size\": 1, \"height\": 768, \"width\": 768, \"num_image_per_prompt\": num_image_per_prompt}\n", @@ -374,6 +375,8 @@ "outputs": [], "source": [ "from diffusers import DPMSolverMultistepScheduler\n", + "\n", + "\n", "stable_diffusion.scheduler = 
DPMSolverMultistepScheduler.from_config(stable_diffusion.scheduler.config)" ] }, @@ -384,11 +387,11 @@ "metadata": {}, "outputs": [], "source": [ - "from matplotlib import pyplot as plt\n", - "from matplotlib import image as mpimg\n", "import time\n", - "import copy\n", - "import numpy as np " + "\n", + "import numpy as np\n", + "from matplotlib import image as mpimg\n", + "from matplotlib import pyplot as plt" ] }, { diff --git a/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb b/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb index 189890438..c8fafda7c 100644 --- a/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb +++ b/notebooks/stable-diffusion/stable-diffusion-xl-txt2img.ipynb @@ -56,6 +56,7 @@ "source": [ "from optimum.neuron import NeuronStableDiffusionXLPipeline\n", "\n", + "\n", "model_id = \"stabilityai/stable-diffusion-xl-base-1.0\"\n", "num_image_per_prompt = 1\n", "input_shapes = {\"batch_size\": 1, \"height\": 1024, \"width\": 1024, \"num_image_per_prompt\": num_image_per_prompt}\n", @@ -423,6 +424,8 @@ "outputs": [], "source": [ "from diffusers import DPMSolverMultistepScheduler\n", + "\n", + "\n", "stable_diffusion_xl.scheduler = DPMSolverMultistepScheduler.from_config(stable_diffusion_xl.scheduler.config)" ] }, @@ -433,11 +436,11 @@ "metadata": {}, "outputs": [], "source": [ - "from matplotlib import pyplot as plt\n", - "from matplotlib import image as mpimg\n", "import time\n", - "import copy\n", - "import numpy as np " + "\n", + "import numpy as np\n", + "from matplotlib import image as mpimg\n", + "from matplotlib import pyplot as plt" ] }, { diff --git a/notebooks/text-classification/notebook.ipynb b/notebooks/text-classification/notebook.ipynb index 7b0343d09..b03ac1502 100644 --- a/notebooks/text-classification/notebook.ipynb +++ b/notebooks/text-classification/notebook.ipynb @@ -85,6 +85,7 @@ "source": [ "from datasets import load_dataset\n", "\n", + "\n", "# Dataset id from huggingface.co/dataset\n", "dataset_id = \"philschmid/emotion\"\n", "\n", @@ -116,6 +117,7 @@ "source": [ "from random import randrange\n", "\n", + "\n", "random_id = randrange(len(raw_dataset['train']))\n", "raw_dataset['train'][random_id]\n", "# {'text': 'i feel isolated and alone in my trade', 'label': 0}" @@ -139,8 +141,11 @@ "metadata": {}, "outputs": [], "source": [ - "from transformers import AutoTokenizer\n", "import os\n", + "\n", + "from transformers import AutoTokenizer\n", + "\n", + "\n", "# Model id to load the tokenizer\n", "model_id = \"bert-base-uncased\"\n", "save_dataset_path = \"lm_dataset\"\n", diff --git a/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb b/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb index 1a50dbafc..f3ebf98fc 100644 --- a/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb +++ b/notebooks/text-generation/CodeLlama-7B-Compilation.ipynb @@ -96,6 +96,7 @@ "source": [ "from optimum.neuron import pipeline\n", "\n", + "\n", "p = pipeline('text-generation', 'aws-neuron/CodeLlama-7b-hf-neuron-8xlarge')\n", "p(\"import socket\\n\\ndef ping_exponential_backoff(host: str):\",\n", " do_sample=True,\n", @@ -188,10 +189,12 @@ "outputs": [], "source": [ "from optimum.neuron import NeuronModelForCausalLM\n", + "\n", + "\n", "#num_cores should be changed based on the instance. 
inf2.24xlarge has 6 neuron processors (they have two cores each) so 12 total\n", "compiler_args = {\"num_cores\": 2, \"auto_cast_type\": 'fp16'}\n", "input_shapes = {\"batch_size\": 1, \"sequence_length\": 2048}\n", - "model = NeuronModelForCausalLM.from_pretrained(\"codellama/CodeLlama-7b-hf\", export=True, **compiler_args, **input_shapes) " + "model = NeuronModelForCausalLM.from_pretrained(\"codellama/CodeLlama-7b-hf\", export=True, **compiler_args, **input_shapes)" ] }, { @@ -211,8 +214,7 @@ "metadata": {}, "outputs": [], "source": [ - "model.save_pretrained(\"CodeLlama-7b-hf-neuron-8xlarge\")\n", - " " + "model.save_pretrained(\"CodeLlama-7b-hf-neuron-8xlarge\")\n" ] }, { @@ -251,10 +253,21 @@ "outputs": [], "source": [ "from huggingface_hub.hf_api import HfFolder\n", - "HfFolder.save_token('MY_HUGGINGFACE_TOKEN_HERE')\n", "\n", - "from huggingface_hub import login\n", - "from huggingface_hub import HfApi\n", + "\n", + "HfFolder.save_token('MY_HUGGINGFACE_TOKEN_HERE')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdbc2537", + "metadata": {}, + "outputs": [], + "source": [ + "from huggingface_hub import HfApi, login\n", + "\n", + "\n", "api = HfApi()\n", "login()\n", "\n", @@ -264,9 +277,7 @@ " repo_type=\"model\",\n", " multi_commits=True,\n", " multi_commits_verbose=True,\n", - ")\n", - "\n", - "\n" + ")" ] } ], diff --git a/notebooks/text-generation/llama2-13b-chatbot.ipynb b/notebooks/text-generation/llama2-13b-chatbot.ipynb index 59ece3802..788ee756f 100644 --- a/notebooks/text-generation/llama2-13b-chatbot.ipynb +++ b/notebooks/text-generation/llama2-13b-chatbot.ipynb @@ -61,7 +61,6 @@ "outputs": [], "source": [ "# Special widgets are required for a nicer display\n", - "import sys\n", "!{sys.executable} -m pip install ipywidgets" ] }, @@ -103,6 +102,7 @@ "source": [ "from optimum.neuron import NeuronModelForCausalLM\n", "\n", + "\n", "compiler_args = {\"num_cores\": 24, \"auto_cast_type\": 'fp16'}\n", "input_shapes = {\"batch_size\": 1, \"sequence_length\": 2048}\n", "model = NeuronModelForCausalLM.from_pretrained(\n", @@ -153,6 +153,7 @@ "source": [ "from huggingface_hub import notebook_login\n", "\n", + "\n", "notebook_login(new_session=False)" ] }, @@ -175,6 +176,7 @@ "source": [ "from huggingface_hub import whoami\n", "\n", + "\n", "org = whoami()['name']\n", "\n", "repo_id = f\"{org}/llama-2-13b-chat-neuron\"\n", @@ -238,6 +240,7 @@ "source": [ "from optimum.neuron import NeuronModelForCausalLM\n", "\n", + "\n", "try:\n", " model\n", "except NameError:\n", @@ -262,6 +265,7 @@ "source": [ "from transformers import AutoTokenizer\n", "\n", + "\n", "tokenizer = AutoTokenizer.from_pretrained(\"NousResearch/Llama-2-13b-chat-hf\")" ] }, @@ -320,13 +324,10 @@ "source": [ "def format_chat_prompt(message, history, max_tokens):\n", " \"\"\" Convert a history of messages to a chat prompt\n", - " \n", - " \n", " Args:\n", " message(str): the new user message.\n", " history (List[str]): the list of user messages and assistant responses.\n", " max_tokens (int): the maximum number of input tokens accepted by the model.\n", - " \n", " Returns:\n", " a `str` prompt.\n", " \"\"\"\n", diff --git a/notebooks/text-generation/llama2-7b-fine-tuning.ipynb b/notebooks/text-generation/llama2-7b-fine-tuning.ipynb index f86eef356..c8db71270 100644 --- a/notebooks/text-generation/llama2-7b-fine-tuning.ipynb +++ b/notebooks/text-generation/llama2-7b-fine-tuning.ipynb @@ -154,9 +154,11 @@ } ], "source": [ - "from datasets import load_dataset\n", "from random import randrange\n", 
"\n", + "from datasets import load_dataset\n", + "\n", + "\n", "# Load dataset from the hub\n", "dataset = load_dataset(\"databricks/databricks-dolly-15k\", split=\"train\")\n", "\n", @@ -215,6 +217,7 @@ "source": [ "from random import randrange\n", "\n", + "\n", "print(format_dolly(dataset[randrange(len(dataset))]))" ] }, @@ -233,6 +236,7 @@ "source": [ "from transformers import AutoTokenizer\n", "\n", + "\n", "# Hugging Face model id\n", "model_id = \"philschmid/Llama-2-7b-hf\" # ungated\n", "# model_id = \"meta-llama/Llama-2-7b-hf\" # gated\n", @@ -257,10 +261,12 @@ "metadata": {}, "outputs": [], "source": [ - "from random import randint\n", "# add utils method to path for loading dataset\n", "import sys\n", - "sys.path.append(\"./scripts/utils\") # make sure you change this to the correct path \n", + "from random import randint\n", + "\n", + "\n", + "sys.path.append(\"./scripts/utils\") # make sure you change this to the correct path\n", "from pack_dataset import pack_dataset\n", "\n", "\n", @@ -337,7 +343,7 @@ "metadata": {}, "outputs": [], "source": [ - "# precompilation command \n", + "# precompilation command\n", "!MALLOC_ARENA_MAX=64 neuron_parallel_compile torchrun --nproc_per_node=32 scripts/run_clm.py \\\n", " --model_id {model_id} \\\n", " --dataset_path {dataset_path} \\\n", @@ -455,9 +461,11 @@ "metadata": {}, "outputs": [], "source": [ - "from optimum.neuron import NeuronModelForCausalLM\n", "from transformers import AutoTokenizer\n", "\n", + "from optimum.neuron import NeuronModelForCausalLM\n", + "\n", + "\n", "compiler_args = {\"num_cores\": 2, \"auto_cast_type\": 'fp16'}\n", "input_shapes = {\"batch_size\": 1, \"sequence_length\": 2048}\n", "\n", @@ -502,13 +510,13 @@ "def format_dolly_infernece(sample):\n", " instruction = f\"### Instruction\\n{sample['instruction']}\"\n", " context = f\"### Context\\n{sample['context']}\" if \"context\" in sample else None\n", - " response = f\"### Answer\\n\"\n", + " response = \"### Answer\\n\"\n", " # join all the parts together\n", " prompt = \"\\n\\n\".join([i for i in [instruction, context, response] if i is not None])\n", " return prompt\n", "\n", "\n", - "def generate(sample): \n", + "def generate(sample):\n", " prompt = format_dolly_infernece(sample)\n", " inputs = tokenizer(prompt, return_tensors=\"pt\")\n", " outputs = model.generate(**inputs,\n", diff --git a/optimum/exporters/neuron/__main__.py b/optimum/exporters/neuron/__main__.py index 74b4d1cf1..2fa28e68a 100644 --- a/optimum/exporters/neuron/__main__.py +++ b/optimum/exporters/neuron/__main__.py @@ -52,7 +52,6 @@ check_mandatory_input_shapes, get_encoder_decoder_models_for_export, get_stable_diffusion_models_for_export, - load_controlnets, replace_stable_diffusion_submodels, ) @@ -76,7 +75,7 @@ from transformers import PreTrainedModel if is_diffusers_available(): - from diffusers import ControlNetModel, DiffusionPipeline, ModelMixin, StableDiffusionPipeline + from diffusers import DiffusionPipeline, ModelMixin, StableDiffusionPipeline logger = logging.get_logger() @@ -207,7 +206,7 @@ def normalize_stable_diffusion_input_shapes( def infer_stable_diffusion_shapes_from_diffusers( input_shapes: Dict[str, Dict[str, int]], model: Union["StableDiffusionPipeline", "StableDiffusionXLPipeline"], - controlnets: Optional[List["ControlNetModel"]] = None, + has_controlnets: bool, ): if model.tokenizer is not None: sequence_length = model.tokenizer.model_max_length @@ -242,7 +241,10 @@ def infer_stable_diffusion_shapes_from_diffusers( ) # ControlNet - if controlnets: + if 
has_controlnets: + encoder_hidden_size = model.text_encoder.config.hidden_size + if hasattr(model, "text_encoder_2"): + encoder_hidden_size += model.text_encoder_2.config.hidden_size input_shapes["controlnet"] = { "batch_size": input_shapes["unet"]["batch_size"], "sequence_length": sequence_length, @@ -250,7 +252,7 @@ def infer_stable_diffusion_shapes_from_diffusers( "height": scaled_height, "width": scaled_width, "vae_scale_factor": vae_scale_factor, - "encoder_hidden_size": model.text_encoder.config.hidden_size, + "encoder_hidden_size": encoder_hidden_size, } return input_shapes @@ -272,7 +274,7 @@ def get_submodels_and_neuron_configs( lora_weight_names: Optional[Union[str, List[str]]] = None, lora_adapter_names: Optional[Union[str, List[str]]] = None, lora_scales: Optional[Union[float, List[float]]] = None, - controlnets: Optional[List["ControlNetModel"]] = None, + controlnet_ids: Optional[Union[str, List[str]]] = None, ): is_stable_diffusion = "stable-diffusion" in task is_encoder_decoder = ( @@ -295,7 +297,7 @@ def get_submodels_and_neuron_configs( lora_weight_names=lora_weight_names, lora_adapter_names=lora_adapter_names, lora_scales=lora_scales, - controlnets=controlnets, + controlnet_ids=controlnet_ids, ) elif is_encoder_decoder: optional_outputs = {"output_attentions": output_attentions, "output_hidden_states": output_hidden_states} @@ -356,7 +358,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( lora_weight_names: Optional[Union[str, List[str]]] = None, lora_adapter_names: Optional[Union[str, List[str]]] = None, lora_scales: Optional[Union[float, List[float]]] = None, - controlnets: Optional[List["ControlNetModel"]] = None, + controlnet_ids: Optional[Union[str, List[str]]] = None, ): check_compiler_compatibility_for_stable_diffusion() model = replace_stable_diffusion_submodels(model, submodels) @@ -367,7 +369,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( input_shapes = infer_stable_diffusion_shapes_from_diffusers( input_shapes=input_shapes, model=model, - controlnets=controlnets, + has_controlnets=controlnet_ids is not None, ) # Saving the model config and preprocessor as this is needed sometimes. 
@@ -396,7 +398,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( lora_weight_names=lora_weight_names, lora_adapter_names=lora_adapter_names, lora_scales=lora_scales, - controlnets=controlnets, + controlnet_ids=controlnet_ids, controlnet_input_shapes=input_shapes.get("controlnet", None), ) output_model_names = { @@ -414,13 +416,14 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion( ) # ControlNet models - if controlnets: - for idx in range(len(controlnets)): + if controlnet_ids: + if isinstance(controlnet_ids, str): + controlnet_ids = [controlnet_ids] + for idx in range(len(controlnet_ids)): controlnet_name = DIFFUSION_MODEL_CONTROLNET_NAME + "_" + str(idx) output_model_names[controlnet_name] = os.path.join(controlnet_name, NEURON_FILE_NAME) del model - del controlnets return models_and_neuron_configs, output_model_names @@ -475,7 +478,7 @@ def load_models_and_neuron_configs( lora_weight_names: Optional[Union[str, List[str]]], lora_adapter_names: Optional[Union[str, List[str]]], lora_scales: Optional[Union[float, List[float]]], - controlnet_ids: Optional[Union[str, List[str]]], + controlnet_ids: Optional[Union[str, List[str]]] = None, output_attentions: bool = False, output_hidden_states: bool = False, library_name: Optional[str] = None, @@ -500,7 +503,6 @@ def load_models_and_neuron_configs( } if model is None: model = TasksManager.get_model_from_task(**model_kwargs) - controlnets = load_controlnets(controlnet_ids) models_and_neuron_configs, output_model_names = get_submodels_and_neuron_configs( model=model, @@ -518,7 +520,7 @@ def load_models_and_neuron_configs( lora_weight_names=lora_weight_names, lora_adapter_names=lora_adapter_names, lora_scales=lora_scales, - controlnets=controlnets, + controlnet_ids=controlnet_ids, ) return models_and_neuron_configs, output_model_names diff --git a/optimum/exporters/neuron/convert.py b/optimum/exporters/neuron/convert.py index 810bbbedf..51607f82e 100644 --- a/optimum/exporters/neuron/convert.py +++ b/optimum/exporters/neuron/convert.py @@ -194,6 +194,10 @@ def validate_model_outputs( ref_inputs = tuple(ref_inputs.values()) ref_outputs = reference_model(*ref_inputs) neuron_inputs = tuple(inputs.values()) + elif "controlnet" in getattr(config._config, "_class_name", "").lower(): + reference_model = config.patch_model_for_export(reference_model, ref_inputs) + neuron_inputs = ref_inputs = tuple(ref_inputs.values()) + ref_outputs = reference_model(*ref_inputs) else: ref_outputs = reference_model(**ref_inputs) neuron_inputs = tuple(config.flatten_inputs(inputs).values()) @@ -351,66 +355,58 @@ def export_models( output_path = output_dir / output_file_name output_path.parent.mkdir(parents=True, exist_ok=True) - try: - # TODO: Remove after the weights/neff separation compilation of sdxl is patched by a neuron sdk release: https://github.com/aws-neuron/aws-neuron-sdk/issues/859 - if not inline_weights_to_neff and getattr(sub_neuron_config, "is_sdxl", False): - logger.warning( - "The compilation of SDXL's unet with the weights/neff separation is broken since the Neuron sdk 2.18 release. `inline_weights_to_neff` will be set to True and the caching will be disabled. If you still want to separate the neff and weights, please downgrade your Neuron setup to the 2.17.1 release." 
- ) - inline_weights_to_neff = True - - start_time = time.time() - neuron_inputs, neuron_outputs = export( - model=submodel, - config=sub_neuron_config, - output=output_path, - compiler_workdir=compiler_workdir, - inline_weights_to_neff=inline_weights_to_neff, - optlevel=optlevel, - **compiler_kwargs, - ) - compilation_time = time.time() - start_time - total_compilation_time += compilation_time - logger.info(f"[Compilation Time] {np.round(compilation_time, 2)} seconds.") - all_inputs[model_name] = neuron_inputs - all_outputs[model_name] = neuron_outputs - # Add neuron specific configs to model components' original config - if hasattr(submodel, "config"): - model_config = submodel.config - elif configs and (model_name in configs.keys()): - model_config = configs[model_name] - else: - raise AttributeError("Cannot find model's configuration, please pass it with `configs`.") - - if is_diffusers_available() and isinstance(model_config, FrozenDict): - model_config = OrderedDict(model_config) - model_config = DiffusersPretrainedConfig.from_dict(model_config) - - model_config = store_compilation_config( - config=model_config, - input_shapes=sub_neuron_config.input_shapes, - compiler_kwargs=compiler_kwargs, - input_names=neuron_inputs, - output_names=neuron_outputs, - dynamic_batch_size=sub_neuron_config.dynamic_batch_size, - compiler_type=NEURON_COMPILER_TYPE, - compiler_version=NEURON_COMPILER_VERSION, - inline_weights_to_neff=inline_weights_to_neff, - optlevel=optlevel, - model_type=getattr(sub_neuron_config, "MODEL_TYPE", None), - task=getattr(sub_neuron_config, "task", None), - output_attentions=getattr(sub_neuron_config, "output_attentions", False), - output_hidden_states=getattr(sub_neuron_config, "output_hidden_states", False), - ) - model_config.save_pretrained(output_path.parent) - compile_configs[model_name] = model_config - except Exception as e: - failed_models.append((i, model_name)) - output_path.parent.rmdir() - logger.error( - f"An error occured when trying to trace {model_name} with the error message: {e}.\n" - f"The export is failed and {model_name} neuron model won't be stored." + # TODO: Remove after the weights/neff separation compilation of sdxl is patched by a neuron sdk release: https://github.com/aws-neuron/aws-neuron-sdk/issues/859 + if not inline_weights_to_neff and getattr(sub_neuron_config, "is_sdxl", False): + logger.warning( + "The compilation of SDXL's unet with the weights/neff separation is broken since the Neuron SDK 2.18 release. `inline_weights_to_neff` will be set to True and the caching will be disabled. If you still want to separate the neff and weights, please downgrade your Neuron setup to the 2.17.1 release." 
) + inline_weights_to_neff = True + + start_time = time.time() + neuron_inputs, neuron_outputs = export( + model=submodel, + config=sub_neuron_config, + output=output_path, + compiler_workdir=compiler_workdir, + inline_weights_to_neff=inline_weights_to_neff, + optlevel=optlevel, + **compiler_kwargs, + ) + compilation_time = time.time() - start_time + total_compilation_time += compilation_time + logger.info(f"[Compilation Time] {np.round(compilation_time, 2)} seconds.") + all_inputs[model_name] = neuron_inputs + all_outputs[model_name] = neuron_outputs + # Add neuron specific configs to model components' original config + if hasattr(submodel, "config"): + model_config = submodel.config + elif configs and (model_name in configs.keys()): + model_config = configs[model_name] + else: + raise AttributeError("Cannot find model's configuration, please pass it with `configs`.") + + if is_diffusers_available() and isinstance(model_config, FrozenDict): + model_config = OrderedDict(model_config) + model_config = DiffusersPretrainedConfig.from_dict(model_config) + + model_config = store_compilation_config( + config=model_config, + input_shapes=sub_neuron_config.input_shapes, + compiler_kwargs=compiler_kwargs, + input_names=neuron_inputs, + output_names=neuron_outputs, + dynamic_batch_size=sub_neuron_config.dynamic_batch_size, + compiler_type=NEURON_COMPILER_TYPE, + compiler_version=NEURON_COMPILER_VERSION, + inline_weights_to_neff=inline_weights_to_neff, + optlevel=optlevel, + model_type=getattr(sub_neuron_config, "MODEL_TYPE", None), + task=getattr(sub_neuron_config, "task", None), + output_attentions=getattr(sub_neuron_config, "output_attentions", False), + output_hidden_states=getattr(sub_neuron_config, "output_hidden_states", False), + ) + model_config.save_pretrained(output_path.parent) + compile_configs[model_name] = model_config logger.info(f"[Total compilation Time] {np.round(total_compilation_time, 2)} seconds.") diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 689de5331..1f59c9031 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -681,12 +681,19 @@ class ControlNetNeuronConfig(VisionNeuronConfig): DUMMY_INPUT_GENERATOR_CLASSES = ( DummyVisionInputGenerator, - DummyControNetInputGenerator, + DummyControNetInputGenerator, # Instead of `encoder_hidden_states` generated by `DummySeq2SeqDecoderTextInputGenerator` + DummyTimestepInputGenerator, + DummySeq2SeqDecoderTextInputGenerator, ) @property def inputs(self) -> List[str]: common_inputs = ["sample", "timestep", "encoder_hidden_states", "controlnet_cond", "conditioning_scale"] + + if getattr(self._normalized_config, "addition_embed_type", None) == "text_time": + common_inputs.append("text_embeds") + common_inputs.append("time_ids") + return common_inputs @property diff --git a/optimum/exporters/neuron/model_wrappers.py b/optimum/exporters/neuron/model_wrappers.py index 8a47c779e..9c83168ce 100644 --- a/optimum/exporters/neuron/model_wrappers.py +++ b/optimum/exporters/neuron/model_wrappers.py @@ -94,12 +94,19 @@ def forward(self, *inputs): controlnet_cond = ordered_inputs.pop("controlnet_cond", None) conditioning_scale = ordered_inputs.pop("conditioning_scale", None) + # Additional conditions for the Stable Diffusion XL UNet. 
+ added_cond_kwargs = { + "text_embeds": ordered_inputs.pop("text_embeds", None), + "time_ids": ordered_inputs.pop("time_ids", None), + } + out_tuple = self.model( sample=sample, timestep=timestep, encoder_hidden_states=encoder_hidden_states, controlnet_cond=controlnet_cond, conditioning_scale=conditioning_scale, + added_cond_kwargs=added_cond_kwargs, guess_mode=False, # TODO: support guess mode of ControlNet return_dict=False, **ordered_inputs, diff --git a/optimum/exporters/neuron/utils.py b/optimum/exporters/neuron/utils.py index 0d9f863bf..81ff32c81 100644 --- a/optimum/exporters/neuron/utils.py +++ b/optimum/exporters/neuron/utils.py @@ -119,7 +119,7 @@ def get_stable_diffusion_models_for_export( lora_weight_names: Optional[List[str]] = None, lora_adapter_names: Optional[List[str]] = None, lora_scales: Optional[List[float]] = None, - controlnets: Optional[List["ControlNetModel"]] = None, + controlnet_ids: Optional[Union[str, List[str]]] = None, controlnet_input_shapes: Optional[Dict[str, int]] = None, ) -> Dict[str, Tuple[Union["PreTrainedModel", "ModelMixin"], "NeuronDefaultConfig"]]: """ @@ -153,8 +153,8 @@ def get_stable_diffusion_models_for_export( List of adapter names to be used for referencing the loaded adapter models. lora_scales (`Optional[List[float]]`, defaults to `None`): List of scaling factors for lora adapters. - controlnets (`Optional[List["ControlNetModel"]]]`, defaults to `None`): - One or multiple ControlNets providing additional conditioning to the `unet` during the denoising process. If you set multiple + controlnet_ids (`Optional[Union[str, List[str]]]`, defaults to `None`): + Model ID of one or multiple ControlNets providing additional conditioning to the `unet` during the denoising process. If you set multiple ControlNets as a list, the outputs from each ControlNet are added together to create one combined additional conditioning. controlnet_input_shapes (`Optional[Dict[str, int]]`, defaults to `None`): Static shapes used for compiling ControlNets. 
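The `controlnet_ids` accepted here are plain Hub or local model IDs; a helper (`load_controlnets`, used further down in this file) turns them into `diffusers` ControlNet models before export. A minimal sketch of that behaviour, assuming it simply maps each ID to a `ControlNetModel` (the real helper may additionally handle dtype, revisions, or caching):

```python
from typing import List, Optional, Union

from diffusers import ControlNetModel


def load_controlnets(controlnet_ids: Optional[Union[str, List[str]]] = None) -> List[ControlNetModel]:
    """Map one or several ControlNet model IDs to `diffusers.ControlNetModel` instances."""
    if not controlnet_ids:
        return []
    if isinstance(controlnet_ids, str):
        controlnet_ids = [controlnet_ids]
    return [ControlNetModel.from_pretrained(model_id) for model_id in controlnet_ids]
```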
@@ -170,6 +170,7 @@ def get_stable_diffusion_models_for_export( lora_weight_names=lora_weight_names, lora_adapter_names=lora_adapter_names, lora_scales=lora_scales, + controlnet_ids=controlnet_ids, ) library_name = "diffusers" @@ -227,7 +228,7 @@ def get_stable_diffusion_models_for_export( if task == "stable-diffusion-xl": unet_neuron_config.is_sdxl = True - unet_neuron_config.with_controlnet = True if controlnets else False + unet_neuron_config.with_controlnet = True if controlnet_ids else False models_for_export[DIFFUSION_MODEL_UNET_NAME] = (unet, unet_neuron_config) @@ -266,8 +267,12 @@ def get_stable_diffusion_models_for_export( models_for_export[DIFFUSION_MODEL_VAE_DECODER_NAME] = (vae_decoder, vae_decoder_neuron_config) # ControlNet - if controlnets: - for idx, controlnet in enumerate(controlnets): + if controlnet_ids: + if isinstance(controlnet_ids, str): + controlnet_ids = [controlnet_ids] + for idx in range(len(controlnet_ids)): + controlnet_name = DIFFUSION_MODEL_CONTROLNET_NAME + "_" + str(idx) + controlnet = models_for_export[controlnet_name] controlnet_config_constructor = TasksManager.get_exporter_config_constructor( model=controlnet, exporter="neuron", @@ -281,7 +286,7 @@ def get_stable_diffusion_models_for_export( dynamic_batch_size=dynamic_batch_size, **controlnet_input_shapes, ) - models_for_export[DIFFUSION_MODEL_CONTROLNET_NAME + "_" + str(idx)] = ( + models_for_export[controlnet_name] = ( controlnet, controlnet_neuron_config, ) @@ -351,6 +356,7 @@ def get_submodels_for_export_stable_diffusion( lora_weight_names: Optional[Union[str, List[str]]] = None, lora_adapter_names: Optional[Union[str, List[str]]] = None, lora_scales: Optional[List[float]] = None, + controlnet_ids: Optional[Union[str, List[str]]] = None, ) -> Dict[str, Union["PreTrainedModel", "ModelMixin"]]: """ Returns the components of a Stable Diffusion model. 
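For SDXL, the ControlNet is traced against the concatenated hidden states of both text encoders, which is why the shape inference earlier in this change sums the two `hidden_size` values. A quick sanity check of the resulting width, assuming the SDXL base checkpoint configs are reachable on the Hub:

```python
from transformers import AutoConfig

model_id = "stabilityai/stable-diffusion-xl-base-1.0"
enc1 = AutoConfig.from_pretrained(model_id, subfolder="text_encoder")
enc2 = AutoConfig.from_pretrained(model_id, subfolder="text_encoder_2")

# 768 (CLIP ViT-L) + 1280 (OpenCLIP bigG) = 2048, the cross-attention width the ControlNet must accept.
print(enc1.hidden_size + enc2.hidden_size)
```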
@@ -381,6 +387,7 @@ def get_submodels_for_export_stable_diffusion( text_encoder_2 = getattr(pipeline, "text_encoder_2", None) if text_encoder_2 is not None: text_encoder_2.config.output_hidden_states = True + text_encoder_2.text_model.config.output_hidden_states = True models_for_export.append((DIFFUSION_MODEL_TEXT_ENCODER_2_NAME, copy.deepcopy(text_encoder_2))) # U-NET @@ -418,6 +425,15 @@ def get_submodels_for_export_stable_diffusion( vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) models_for_export.append((DIFFUSION_MODEL_VAE_DECODER_NAME, vae_decoder)) + # ControlNets + controlnets = load_controlnets(controlnet_ids) + if controlnets: + for idx, controlnet in enumerate(controlnets): + controlnet.config.text_encoder_projection_dim = pipeline.unet.config.text_encoder_projection_dim + controlnet.config.requires_aesthetics_score = pipeline.unet.config.requires_aesthetics_score + controlnet.config.time_cond_proj_dim = pipeline.unet.config.time_cond_proj_dim + models_for_export.append((DIFFUSION_MODEL_CONTROLNET_NAME + "_" + str(idx), controlnet)) + return OrderedDict(models_for_export) diff --git a/optimum/neuron/__init__.py b/optimum/neuron/__init__.py index 9f989a6c2..a55e42ef3 100644 --- a/optimum/neuron/__init__.py +++ b/optimum/neuron/__init__.py @@ -27,7 +27,7 @@ _import_structure = { "hf_argparser": ["NeuronHfArgumentParser"], - "trainers": ["NeuronTrainer", "Seq2SeqNeuronTrainer"], + "trainers": ["NeuronTrainer", "Seq2SeqNeuronTrainer", "NeuronSFTTrainer"], "training_args": ["NeuronTrainingArguments", "Seq2SeqNeuronTrainingArguments"], "modeling_traced": ["NeuronTracedModel"], "modeling": [ @@ -58,6 +58,7 @@ "NeuronStableDiffusionXLImg2ImgPipeline", "NeuronStableDiffusionXLInpaintPipeline", "NeuronStableDiffusionControlNetPipeline", + "NeuronStableDiffusionXLControlNetPipeline", ], "modeling_decoder": ["NeuronDecoderModel"], "modeling_seq2seq": ["NeuronModelForSeq2SeqLM"], @@ -68,7 +69,7 @@ "ModelParallelismPlugin", ], "pipelines": ["pipeline"], - "utils": ["get_peft_model"], + "utils": ["NeuronSFTConfig", "get_peft_model"], } if TYPE_CHECKING: @@ -100,6 +101,7 @@ NeuronStableDiffusionInstructPix2PixPipeline, NeuronStableDiffusionPipeline, NeuronStableDiffusionPipelineBase, + NeuronStableDiffusionXLControlNetPipeline, NeuronStableDiffusionXLImg2ImgPipeline, NeuronStableDiffusionXLInpaintPipeline, NeuronStableDiffusionXLPipeline, @@ -107,9 +109,9 @@ from .modeling_seq2seq import NeuronModelForSeq2SeqLM from .modeling_traced import NeuronTracedModel from .pipelines import pipeline - from .trainers import NeuronTrainer, Seq2SeqNeuronTrainer + from .trainers import NeuronSFTTrainer, NeuronTrainer, Seq2SeqNeuronTrainer from .training_args import NeuronTrainingArguments, Seq2SeqNeuronTrainingArguments - from .utils import get_peft_model + from .utils import NeuronSFTConfig, get_peft_model else: import sys diff --git a/optimum/neuron/generation/token_selector.py b/optimum/neuron/generation/token_selector.py index 3d0935cc4..6edd4fd1c 100644 --- a/optimum/neuron/generation/token_selector.py +++ b/optimum/neuron/generation/token_selector.py @@ -92,6 +92,7 @@ def create( """ generation_config.validate() generation_config = copy.deepcopy(generation_config) + model._prepare_special_tokens(generation_config) unsupported_generation_flags = [ "output_attentions", diff --git a/optimum/neuron/modeling_diffusion.py b/optimum/neuron/modeling_diffusion.py index 4e3e193c1..30a310bf8 100644 --- a/optimum/neuron/modeling_diffusion.py +++ 
b/optimum/neuron/modeling_diffusion.py @@ -1058,15 +1058,15 @@ def forward( inputs = (sample, timestep, encoder_hidden_states) if timestep_cond is not None: inputs = inputs + (timestep_cond,) - if added_cond_kwargs is not None: - text_embeds = added_cond_kwargs.pop("text_embeds", None) - time_ids = added_cond_kwargs.pop("time_ids", None) - inputs = inputs + (text_embeds, time_ids) if mid_block_additional_residual is not None: inputs = inputs + (mid_block_additional_residual,) if down_block_additional_residuals is not None: for idx in range(len(down_block_additional_residuals)): inputs = inputs + (down_block_additional_residuals[idx],) + if added_cond_kwargs: + text_embeds = added_cond_kwargs.pop("text_embeds", None) + time_ids = added_cond_kwargs.pop("time_ids", None) + inputs = inputs + (text_embeds, time_ids) outputs = self.model(*inputs) return outputs @@ -1139,9 +1139,15 @@ def forward( controlnet_cond: torch.Tensor, conditioning_scale: float = 1.0, guess_mode: bool = False, + added_cond_kwargs: Optional[Dict] = None, return_dict: bool = True, ) -> Union["ControlNetOutput", Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]: + timestep = timestep.expand((sample.shape[0],)).to(torch.long) inputs = (sample, timestep, encoder_hidden_states, controlnet_cond, conditioning_scale) + if added_cond_kwargs: + text_embeds = added_cond_kwargs.pop("text_embeds", None) + time_ids = added_cond_kwargs.pop("time_ids", None) + inputs += (text_embeds, time_ids) outputs = self.model(*inputs) if guess_mode: @@ -1320,7 +1326,7 @@ class NeuronStableDiffusionXLInpaintPipeline( class NeuronStableDiffusionXLControlNetPipeline( - NeuronStableDiffusionPipelineBase, NeuronStableDiffusionXLControlNetPipelineMixin + NeuronStableDiffusionXLPipelineBase, NeuronStableDiffusionXLControlNetPipelineMixin ): __call__ = NeuronStableDiffusionXLControlNetPipelineMixin.__call__ diff --git a/optimum/neuron/pipelines/diffusers/pipeline_controlnet.py b/optimum/neuron/pipelines/diffusers/pipeline_controlnet.py index 641123635..690872c83 100644 --- a/optimum/neuron/pipelines/diffusers/pipeline_controlnet.py +++ b/optimum/neuron/pipelines/diffusers/pipeline_controlnet.py @@ -238,7 +238,7 @@ def __call__( it will be overriden by the static batch size of neuron (except for dynamic batching). eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies - to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + to the [`diffusers.schedulers.DDIMScheduler`], and is ignored in other schedulers. generator (`Optional[Union[torch.Generator, List[torch.Generator]]]`, defaults to `None`): A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. diff --git a/optimum/neuron/pipelines/diffusers/pipeline_controlnet_sd_xl.py b/optimum/neuron/pipelines/diffusers/pipeline_controlnet_sd_xl.py index 69e80292f..5555add8e 100644 --- a/optimum/neuron/pipelines/diffusers/pipeline_controlnet_sd_xl.py +++ b/optimum/neuron/pipelines/diffusers/pipeline_controlnet_sd_xl.py @@ -14,9 +14,748 @@ # limitations under the License. """Override some diffusers API for NeuronStableDiffusionXLControlNetPipelineMixin""" +import copy +import logging +from typing import Any, Callable, Dict, List, Optional, Tuple, Union -class NeuronStableDiffusionXLControlNetPipelineMixin: - def __call__(self): - raise NotImplementedError( - "`NeuronStableDiffusionXLControlNetPipelineMixin` is not yet supported but will come soon." 
+import torch +from diffusers import StableDiffusionXLControlNetPipeline +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import PipelineImageInput +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps +from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput + +from .pipeline_utils import StableDiffusionXLPipelineMixin + + +logger = logging.getLogger(__name__) + + +class NeuronStableDiffusionXLControlNetPipelineMixin( + StableDiffusionXLPipelineMixin, StableDiffusionXLControlNetPipeline +): + # Adapted from https://github.com/huggingface/diffusers/blob/v0.29.2/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py#L625 + # Replace class types with Neuron ones + def check_inputs( + self, + prompt, + prompt_2, + image, + negative_prompt=None, + negative_prompt_2=None, + prompt_embeds=None, + negative_prompt_embeds=None, + pooled_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + negative_pooled_prompt_embeds=None, + controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + callback_on_step_end_tensor_inputs=None, + ): + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt_2 is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)): + raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + elif negative_prompt_2 is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and pooled_prompt_embeds is None: + raise ValueError( + "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." 
+            )
+
+        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
+            raise ValueError(
+                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
+            )
+
+        # Check `image`
+        if self.controlnet.__class__.__name__ == "NeuronControlNetModel":
+            self.check_image(image, prompt, prompt_embeds)
+        elif self.controlnet.__class__.__name__ == "NeuronMultiControlNetModel":
+            if not isinstance(image, list):
+                raise TypeError("For multiple controlnets: `image` must be type `list`")
+
+            # When `image` is a nested list:
+            # (e.g. [[canny_image_1, pose_image_1], [canny_image_2, pose_image_2]])
+            elif any(isinstance(i, list) for i in image):
+                raise ValueError("A single batch of multiple conditionings is not supported at the moment.")
+            elif len(image) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets."
+                )
+            else:
+                for image_ in image:
+                    self.check_image(image_, prompt, prompt_embeds)
+        else:
+            raise ValueError(
+                f"{self.controlnet.__class__.__name__} is not a supported class for ControlNet. The class must be either `NeuronControlNetModel` or `NeuronMultiControlNetModel`."
+            )
+
+        # Check `controlnet_conditioning_scale`
+        if self.controlnet.__class__.__name__ == "NeuronControlNetModel":
+            if not isinstance(controlnet_conditioning_scale, float):
+                raise TypeError("For single controlnet: `controlnet_conditioning_scale` must be type `float`.")
+        elif self.controlnet.__class__.__name__ == "NeuronMultiControlNetModel":
+            if isinstance(controlnet_conditioning_scale, list):
+                if any(isinstance(i, list) for i in controlnet_conditioning_scale):
+                    raise ValueError("A single batch of multiple conditionings is not supported at the moment.")
+            elif isinstance(controlnet_conditioning_scale, list) and len(controlnet_conditioning_scale) != len(
+                self.controlnet.nets
+            ):
+                raise ValueError(
+                    "For multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have"
+                    " the same length as the number of controlnets"
+                )
+        else:
+            raise ValueError(
+                f"{self.controlnet.__class__.__name__} is not a supported class for ControlNet. The class must be either `NeuronControlNetModel` or `NeuronMultiControlNetModel`."
+            )
+
+        if not isinstance(control_guidance_start, (tuple, list)):
+            control_guidance_start = [control_guidance_start]
+
+        if not isinstance(control_guidance_end, (tuple, list)):
+            control_guidance_end = [control_guidance_end]
+
+        if len(control_guidance_start) != len(control_guidance_end):
+            raise ValueError(
+                f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list."
+            )
+
+        if self.controlnet.__class__.__name__ == "NeuronMultiControlNetModel":
+            if len(control_guidance_start) != len(self.controlnet.nets):
+                raise ValueError(
+                    f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}."
+ ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + # Adapted from https://github.com/huggingface/diffusers/blob/v0.30.0/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py#L899 + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None + ): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + # Adapted from https://github.com/huggingface/diffusers/blob/1f81fbe274e67c843283e69eb8f00bb56f75ffc4/src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py#L1001 + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + prompt_2: Optional[Union[str, List[str]]] = None, + image: PipelineImageInput = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + sigmas: List[float] = None, + denoising_end: Optional[float] = None, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + negative_prompt_2: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.Tensor] = None, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + pooled_prompt_embeds: Optional[torch.Tensor] = None, + negative_pooled_prompt_embeds: Optional[torch.Tensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Optional[Tuple[int, int]] = None, + negative_original_size: Optional[Tuple[int, int]] = None, + negative_crops_coords_top_left: Tuple[int, int] = (0, 0), + negative_target_size: Optional[Tuple[int, int]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The 
call function to the pipeline for generation.
+
+        Args:
+            prompt (`Optional[Union[str, List[str]]]`, defaults to `None`):
+                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
+            prompt_2 (`Optional[Union[str, List[str]]]`, defaults to `None`):
+                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
+                used in both text-encoders.
+            image (`Optional["PipelineImageInput"]`, defaults to `None`):
+                The ControlNet input condition to provide guidance to the `unet` for generation. If the type is
+                specified as `torch.Tensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted
+                as an image. The dimensions of the output image default to `image`'s dimensions. If height and/or
+                width are passed, `image` is resized accordingly. If multiple ControlNets are specified in `init`,
+                images must be passed as a list such that each element of the list can be correctly batched for input
+                to a single ControlNet.
+            num_inference_steps (`int`, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            timesteps (`Optional[List[int]]`, defaults to `None`):
+                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
+                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
+                passed will be used. Must be in descending order.
+            sigmas (`Optional[List[float]]`, defaults to `None`):
+                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
+                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
+                will be used.
+            denoising_end (`Optional[float]`, defaults to `None`):
+                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
+                completed before it is intentionally prematurely terminated. As a result, the returned sample will
+                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
+                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
+                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
+                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
+            guidance_scale (`float`, defaults to 5.0):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`Optional[Union[str, List[str]]]`, defaults to `None`):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            negative_prompt_2 (`Optional[Union[str, List[str]]]`, defaults to `None`):
+                The prompt or prompts to guide what to not include in image generation. This is sent to `tokenizer_2`
+                and `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders.
+            num_images_per_prompt (`int`, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper.
Only applies + to the [`diffusers.schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`Optional[Union[torch.Generator, List[torch.Generator]]]`, defaults to `None`): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`Optional[torch.Tensor]`, defaults to `None`): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`Optional[torch.Tensor]`, defaults to `None`): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`Optional[torch.Tensor]`, defaults to `None`): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + pooled_prompt_embeds (`Optional[torch.Tensor]`, defaults to `None`): + Pre-generated pooled text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, pooled text embeddings are generated from `prompt` input argument. + negative_pooled_prompt_embeds (`Optional[torch.Tensor]`, defaults to `None`): + Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs (prompt + weighting). If not provided, pooled `negative_prompt_embeds` are generated from `negative_prompt` input + argument. + ip_adapter_image: (`Optional[PipelineImageInput]`, defaults to `None`): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`Optional[List[torch.Tensor]]`, defaults to `None`): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of + IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should + contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not + provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`Optional[str]`, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`Optional[Dict[str, Any]]`, defaults to `None`): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + controlnet_conditioning_scale (`Union[float, List[float]]`, defaults to 1.0): + The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set + the corresponding scale as a list. + guess_mode (`bool`, defaults to `False`): + The ControlNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. 
+            control_guidance_start (`Union[float, List[float]]`, defaults to 0.0):
+                The percentage of total steps at which the ControlNet starts applying.
+            control_guidance_end (`Union[float, List[float]]`, defaults to 1.0):
+                The percentage of total steps at which the ControlNet stops applying.
+            original_size (`Optional[Tuple[int, int]]`, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int, int]`, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            target_size (`Optional[Tuple[int, int]]`, defaults to `None`):
+                For most cases, `target_size` should be set to the desired height and width of the generated image. If
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
+                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            negative_original_size (`Optional[Tuple[int, int]]`, defaults to `None`):
+                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_crops_coords_top_left (`Tuple[int, int]`, defaults to (0, 0)):
+                To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
+                micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            negative_target_size (`Optional[Tuple[int, int]]`, defaults to `None`):
+                To negatively condition the generation process based on a target image resolution. It should be the same
+                as the `target_size` in most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
+                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
+            clip_skip (`Optional[int]`, defaults to `None`):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Optional[Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]]`, defaults to `None`):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`.
`callback_kwargs` will include a + list of all tensors as specified by `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List[str]`, defaults to `["latents"]`): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + + Examples: + + Returns: + [`diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned containing the output images. + """ + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + controlnet = self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if controlnet.__class__.__name__ == "NeuronMultiControlNetModel" else 1 + control_guidance_start, control_guidance_end = ( + mult * [control_guidance_start], + mult * [control_guidance_end], + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt=prompt, + prompt_2=prompt_2, + image=image, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + ip_adapter_image=ip_adapter_image, + ip_adapter_image_embeds=ip_adapter_image_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + controlnet_conditioning_scale=controlnet_conditioning_scale, + control_guidance_start=control_guidance_start, + control_guidance_end=control_guidance_end, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._denoising_end = denoising_end + + # 2. 
Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = torch.tensor([controlnet_conditioning_scale]) + if controlnet.__class__.__name__ == "NeuronMultiControlNetModel": + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if controlnet.__class__.__name__ == "NeuronControlNetModel" + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + # TODO: Remove after the guess mode of ControlNet is supported + if guess_mode: + logger.info("Disabling the guess mode as this is not supported yet.") + guess_mode = False + + # 3.1 Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + lora_scale = None + do_classifier_free_guidance = guidance_scale > 1.0 and ( + self.dynamic_batch_size or self.data_parallel_mode == "unet" + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt=prompt, + prompt_2=prompt_2, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + negative_prompt_2=negative_prompt_2, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + pooled_prompt_embeds=pooled_prompt_embeds, + negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, + lora_scale=lora_scale, + clip_skip=self.clip_skip, + ) + + # 3.2 Encode ip_adapter_image + # TODO: support ip adapter + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + logger.info( + "IP adapter is not supported yet, `ip_adapter_image` and `ip_adapter_image_embeds` will be ignored." + ) + + # 4. Prepare image + height = self.vae_encoder.config.neuron["static_height"] + width = self.vae_encoder.config.neuron["static_width"] + if controlnet.__class__.__name__ == "NeuronControlNetModel": + image = self.prepare_image( + image=image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=None, + dtype=None, + do_classifier_free_guidance=do_classifier_free_guidance, + guess_mode=guess_mode, + ) + height, width = image.shape[-2:] + elif controlnet.__class__.__name__ == "NeuronMultiControlNetModel": + images = [] + + for image_ in image: + image_ = self.prepare_image( + image=image_, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=None, + dtype=None, + do_classifier_free_guidance=do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + images.append(image_) + + image = images + height, width = image[0].shape[-2:] + else: + assert False + + # 5. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + scheduler=self.scheduler, + num_inference_steps=num_inference_steps, + device=None, + timesteps=timesteps, + sigmas=sigmas, + ) + self._num_timesteps = len(timesteps) + + # 6. 
Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + generator, + latents, + ) + + # 6.5 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=None, dtype=latents.dtype) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if controlnet.__class__.__name__ == "NeuronControlNetModel" else keeps) + + # 7.2 Prepare added time ids & embeddings + if isinstance(image, list): + original_size = original_size or image[0].shape[-2:] + else: + original_size = original_size or image.shape[-2:] + target_size = target_size or (height, width) + + add_text_embeds = pooled_prompt_embeds + if self.text_encoder_2 is None: + text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1]) + else: + text_encoder_projection_dim = self.text_encoder_2.config.projection_dim + + add_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, ) + + if negative_original_size is not None and negative_target_size is not None: + negative_add_time_ids = self._get_add_time_ids( + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype=prompt_embeds.dtype, + text_encoder_projection_dim=text_encoder_projection_dim, + ) + else: + negative_add_time_ids = add_time_ids + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0) + + add_time_ids = add_time_ids.repeat(batch_size * num_images_per_prompt, 1) + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + + # 8.1 Apply denoising_end + if ( + self.denoising_end is not None + and isinstance(self.denoising_end, float) + and self.denoising_end > 0 + and self.denoising_end < 1 + ): + discrete_timestep_cutoff = int( + round( + self.scheduler.config.num_train_timesteps + - (self.denoising_end * self.scheduler.config.num_train_timesteps) + ) + ) + num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps))) + timesteps = timesteps[:num_inference_steps] + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Relevant thread: + # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428 + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + + # controlnet(s) inference + if guess_mode and do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + controlnet_added_cond_kwargs = { + "text_embeds": add_text_embeds.chunk(2)[1], + "time_ids": add_time_ids.chunk(2)[1], + } + else: + control_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + controlnet_added_cond_kwargs = copy.deepcopy(added_cond_kwargs) + + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + controlnet_cond_scale = controlnet_conditioning_scale + if isinstance(controlnet_cond_scale, list): + controlnet_cond_scale = controlnet_cond_scale[0] + cond_scale = controlnet_cond_scale * controlnet_keep[i] + + # Duplicate inputs for ddp + t = torch.tensor([t] * 2) if self.data_parallel_mode == "unet" else t + cond_scale = ( + torch.tensor([cond_scale]).repeat(2) + if self.data_parallel_mode == "unet" + else torch.tensor(cond_scale) + ) + + down_block_res_samples, mid_block_res_sample = self.controlnet( + control_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=image, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + added_cond_kwargs=controlnet_added_cond_kwargs, + return_dict=False, + ) + + if guess_mode and do_classifier_free_guidance: + # Infered ControlNet only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. + down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + logger.info( + "IP adapter is not supported yet, `ip_adapter_image` and `ip_adapter_image_embeds` will be ignored." 
+ ) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + added_cond_kwargs=added_cond_kwargs, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds) + negative_pooled_prompt_embeds = callback_outputs.pop( + "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds + ) + add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids) + negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if not output_type == "latent": + # unscale/denormalize the latents + # denormalize with the mean and std if available and not None + has_latents_mean = ( + hasattr(self.vae_decoder.config, "latents_mean") and self.vae_decoder.config.latents_mean is not None + ) + has_latents_std = ( + hasattr(self.vae_decoder.config, "latents_std") and self.vae_decoder.config.latents_std is not None + ) + if has_latents_mean and has_latents_std: + latents_mean = ( + torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents_std = ( + torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype) + ) + latents = ( + latents * latents_std / getattr(self.vae_decoder.config, "scaling_factor", 0.18215) + latents_mean + ) + else: + latents = latents / getattr(self.vae_decoder.config, "scaling_factor", 0.18215) + + image = self.vae_decoder(latents)[0] + + else: + image = latents + + if not output_type == "latent": + # apply watermark if available + if self.watermark is not None: + image = self.watermark.apply_watermark(image) + + image = self.image_processor.postprocess(image, output_type=output_type) + + if not return_dict: + return (image,) + + return StableDiffusionXLPipelineOutput(images=image) diff --git a/optimum/neuron/trainers.py b/optimum/neuron/trainers.py index 125122cd1..d80614478 100755 --- a/optimum/neuron/trainers.py +++ b/optimum/neuron/trainers.py @@ -15,21 +15,36 @@ """Defines Trainer subclasses to perform training on AWS Neuron instances.""" import copy +import dataclasses +import inspect import math import os import shutil import sys import time import warnings -from typing import Any, Dict, List, Optional, Tuple, Union +from functools import wraps +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import datasets import numpy as np import torch from accelerate import 
__version__ as accelerate_version +from accelerate.state import PartialState from accelerate.utils import AutocastKwargs, DataLoaderConfiguration, GradientAccumulationPlugin from packaging import version from torch.utils.data import Dataset -from transformers import PreTrainedModel, Seq2SeqTrainer, Trainer, TrainingArguments +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + DataCollator, + DataCollatorForLanguageModeling, + PreTrainedModel, + PreTrainedTokenizerBase, + Seq2SeqTrainer, + Trainer, + TrainingArguments, +) from transformers.debug_utils import DebugOption, DebugUnderflowOverflow from transformers.integrations import hp_params from transformers.modeling_utils import unwrap_model @@ -39,7 +54,7 @@ TRAINER_STATE_NAME, TRAINING_ARGS_NAME, ) -from transformers.trainer_callback import TrainerState +from transformers.trainer_callback import TrainerCallback, TrainerState from transformers.trainer_pt_utils import ( IterableDatasetShard, find_batch_size, @@ -63,6 +78,7 @@ WEIGHTS_NAME, is_accelerate_available, is_apex_available, + is_peft_available, is_sagemaker_mp_enabled, ) @@ -73,6 +89,7 @@ from .training_args import NeuronTrainingArguments from .utils import ( is_torch_xla_available, + is_trl_available, patch_within_function, ) from .utils.cache_utils import ( @@ -84,7 +101,7 @@ ) from .utils.hub_cache_utils import ModelCacheEntry, hub_neuronx_cache, patch_neuron_cc_wrapper, synchronize_hub_cache from .utils.misc import is_main_worker, is_precompilation -from .utils.peft_utils import NeuronPeftModel +from .utils.peft_utils import NeuronPeftModel, get_peft_model from .utils.require_utils import requires_neuronx_distributed, requires_torch_neuronx from .utils.training_utils import ( get_model_param_count, @@ -93,6 +110,7 @@ patch_generation_mixin_to_neuron_generation_mixin, skip_first_batches, ) +from .utils.trl_utils import NeuronSFTConfig from .utils.version_utils import get_neuronxcc_version @@ -111,6 +129,26 @@ else: IS_SAGEMAKER_MP_POST_1_10 = False + +if is_trl_available(): + from trl import SFTConfig, SFTTrainer +else: + + class SFTTrainer: + pass + + class SFTConfig: + pass + + +if is_peft_available(): + from peft import PeftConfig +else: + + class PeftConfig: + pass + + logger = logging.get_logger("transformers.trainer") KEEP_HF_HUB_PROGRESS_BARS = os.environ.get("KEEP_HF_HUB_PROGRESS_BARS") @@ -120,7 +158,7 @@ transformers_get_optimizer_cls_and_kwargs = Trainer.get_optimizer_cls_and_kwargs -class AugmentTrainerForNeuronMixin: +class _TrainerForNeuron: def __init__(self, *args, **kwargs): if not isinstance(self, Trainer): raise TypeError(f"{self.__class__.__name__} can only be mixed with Trainer subclasses.") @@ -454,7 +492,11 @@ def _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, igno tr_loss.zero_() def log_closure(self, reduced_tr_loss, grad_norm): - if is_main_worker_for_metrics(): + # We need to check that self.state.global_step > self._globalstep_last_logged because if two + # closures are added in a row (which can happen at the end of the training), then it will fail the + # second time because at this point we will have: + # self.state.global_step = self._globalstep_last_logged + if is_main_worker_for_metrics() and self.state.global_step > self._globalstep_last_logged: logs: Dict[str, float] = {} tr_loss_scalar = reduced_tr_loss.to("cpu").item() @@ -1455,13 +1497,369 @@ def save_state(self): return super().save_state() -class NeuronTrainer(AugmentTrainerForNeuronMixin, Trainer): +class NeuronTrainer(_TrainerForNeuron, 
Trainer): """ Trainer that is suited for performing training on AWS Tranium instances. """ -class Seq2SeqNeuronTrainer(AugmentTrainerForNeuronMixin, Seq2SeqTrainer): +class Seq2SeqNeuronTrainer(_TrainerForNeuron, Seq2SeqTrainer): """ Seq2SeqTrainer that is suited for performing training on AWS Tranium instances. """ + + +class _SFTTrainerTrainerInit(SFTTrainer): + def __init__(self, *args, **kwargs): + return Trainer.__init__(self, *args, **kwargs) + + +class NeuronSFTTrainer(_TrainerForNeuron, _SFTTrainerTrainerInit): + """ + `SFTTrainer` adapted for Neuron. + + It differs from the original `SFTTrainer` by: + - Using `_TrainerForNeuron.__init__()` instead of `Trainer.__init__()` + - Using the `_TrainerForNeuron.train()` instead of `Trainer.train()` + - Adapts the `_prepare_non_packed_dataloader` to pad to max length. In the original `SFTTrainer` examples are + not padded, which is an issue here because it triggers compilation every time. + """ + + def __init__( + self, + model: Optional[Union[PreTrainedModel, torch.nn.Module, str]] = None, + args: Optional[SFTConfig] = None, + data_collator: Optional[DataCollator] = None, # type: ignore + train_dataset: Optional[Dataset] = None, + eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, + callbacks: Optional[List[TrainerCallback]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + peft_config: Optional["PeftConfig"] = None, + formatting_func: Optional[Callable] = None, + ): + if not is_trl_available(): + raise RuntimeError("Using NeuronSFTTrainer requires the trl library.") + + from trl.extras.dataset_formatting import get_formatting_func_from_dataset + + # This will be changed to : + from trl.trainer.callbacks import RichProgressCallback + from trl.trainer.utils import ( + DataCollatorForCompletionOnlyLM, + peft_module_casting_to_bf16, + ) + + if is_peft_available(): + from peft import PeftConfig, prepare_model_for_kbit_training + + if args is None: + output_dir = "tmp_trainer" + warnings.warn(f"No `SFTConfig` passed, using `output_dir={output_dir}`.") + args = NeuronSFTConfig(output_dir=output_dir) + elif args is not None and args.__class__.__name__ == "NeuronTrainingArguments": + args_as_dict = args.to_dict() + # Manually copy token values as TrainingArguments.to_dict() redacts them + args_as_dict.update({k: getattr(args, k) for k in args_as_dict.keys() if k.endswith("_token")}) + args = NeuronSFTConfig(**args_as_dict) + + if getattr(args, "model_init_kwargs", None) is None: + model_init_kwargs = {} + elif not isinstance(model, str): + raise ValueError("You passed model_init_kwargs to the SFTConfig, but your model is already instantiated.") + else: + model_init_kwargs = args.model_init_kwargs + torch_dtype = model_init_kwargs.get("torch_dtype") + if torch_dtype is not None: + # Convert to `torch.dtype` if an str is passed + if isinstance(torch_dtype, str) and torch_dtype != "auto": + torch_dtype = getattr(torch, torch_dtype) + if torch_dtype != "auto" and not isinstance(torch_dtype, torch.dtype): + raise ValueError( + f"Invalid `torch_dtype` passed to the SFTConfig. Expected a string with either `torch.dtype` or 'auto', but got {torch_dtype}." 
+ ) + model_init_kwargs["torch_dtype"] = torch_dtype + + if isinstance(model, str): + warnings.warn( + "You passed a model_id to the SFTTrainer. This will automatically create an " + "`AutoModelForCausalLM` or a `PeftModel` (if you passed a `peft_config`) for you." + ) + model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs) + + if args.packing and data_collator is not None and isinstance(data_collator, DataCollatorForCompletionOnlyLM): + raise ValueError( + "You passed a `DataCollatorForCompletionOnlyLM` to the SFTTrainer. This is not compatible with the `packing` argument." + ) + + if is_peft_available() and peft_config is not None: + if not isinstance(peft_config, PeftConfig): + raise ValueError( + "If you want to use the PeftModel, you need to pass a PeftConfig object to the SFTTrainer." + f" and you passed a {type(peft_config)}." + ) + + if not isinstance(model, NeuronPeftModel): + _support_gc_kwargs = hasattr( + args, "gradient_checkpointing_kwargs" + ) and "gradient_checkpointing_kwargs" in list( + inspect.signature(prepare_model_for_kbit_training).parameters + ) + gradient_checkpointing_kwargs = getattr(args, "gradient_checkpointing_kwargs", None) or {} + is_sharded_qlora = False + # Below is to support QLoRA + FSDP / DS-Zero3 - one should never call + # peft_module_casting_to_bf16 or prepare_model_for_kbit_training when doing + # QLoRA + FSDP / DS-Zero3 + if getattr(model, "is_loaded_in_4bit", False): + for _, param in model.named_parameters(): + if param.__class__.__name__ == "Params4bit": + is_sharded_qlora = param.data.device.type == "cpu" + break + if getattr(model, "is_loaded_in_8bit", False) or ( + getattr(model, "is_loaded_in_4bit", False) and not is_sharded_qlora + ): + prepare_model_kwargs = { + "use_gradient_checkpointing": getattr(args, "gradient_checkpointing", False) + } + + if _support_gc_kwargs: + prepare_model_kwargs["gradient_checkpointing_kwargs"] = gradient_checkpointing_kwargs + + model = prepare_model_for_kbit_training(model, **prepare_model_kwargs) + + if args is not None: + args = dataclasses.replace(args, gradient_checkpointing=False) + elif getattr(args, "gradient_checkpointing", False) and ( + "use_reentrant" not in gradient_checkpointing_kwargs + or gradient_checkpointing_kwargs["use_reentrant"] + ): + # For backward compatibility with older versions of transformers + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + else: + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + + if ( + "autocast_adapter_dtype" in list(inspect.signature(get_peft_model).parameters) + and getattr(model, "is_loaded_in_4bit", False) + and is_sharded_qlora + ): + model = get_peft_model(model, peft_config, autocast_adapter_dtype=False) + else: + model = get_peft_model(model, peft_config) + if ( + args is not None + and args.bf16 + and getattr(model, "is_loaded_in_4bit", False) + and not is_sharded_qlora + ): + peft_module_casting_to_bf16(model) + + if tokenizer is None: + tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path) + if getattr(tokenizer, "pad_token", None) is None: + tokenizer.pad_token = tokenizer.eos_token + + if args.max_seq_length is None: + # to overcome some issues with broken tokenizers + args.max_seq_length = min(tokenizer.model_max_length, 1024) + + warnings.warn( + f"You didn't pass a `max_seq_length` argument to the SFTTrainer, this will default to {args.max_seq_length}" + 
) + + self.dataset_num_proc = args.dataset_num_proc + + self.dataset_batch_size = args.dataset_batch_size + + self._trainer_supports_neftune = hasattr(args, "neftune_noise_alpha") + + if args.dataset_kwargs is None: + args.dataset_kwargs = {} + + if formatting_func is None and args.dataset_text_field is None: + # check if dataset has ChatML format or instruction format and is supported + # if not stays #None + formatting_func = get_formatting_func_from_dataset(train_dataset, tokenizer) + # if a template is detected, we don't need to add special tokens again + if formatting_func is not None: + args.dataset_kwargs["add_special_tokens"] = False + + if not args.packing: + # If we aren't skipping data preparation, then a dataset_text_field + # or formatting_func must be provided. + if ( + args.dataset_text_field is None + and formatting_func is None + and not args.dataset_kwargs.get("skip_prepare_dataset", False) + ): + raise ValueError( + "You passed `packing=False` to the SFTTrainer/SFTConfig, but you didn't pass a `dataset_text_field` or `formatting_func` argument." + ) + + if data_collator is None: + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + + # Pre-process the datasets only once per node. The remaining processes will use the cache. + with PartialState().local_main_process_first(): + if train_dataset is not None: + train_dataset = self._prepare_dataset( + train_dataset, + tokenizer, + args.packing, + args.dataset_text_field, + args.max_seq_length, + formatting_func, + args.num_of_sequences, + args.chars_per_token, + remove_unused_columns=args.remove_unused_columns if args is not None else True, + **args.dataset_kwargs, + ) + if eval_dataset is not None: + _multiple = isinstance(eval_dataset, dict) + _eval_datasets = eval_dataset if _multiple else {"singleton": eval_dataset} + + eval_packing = args.packing if args.eval_packing is None else args.eval_packing + + for _eval_dataset_name, _eval_dataset in _eval_datasets.items(): + _eval_datasets[_eval_dataset_name] = self._prepare_dataset( + _eval_dataset, + tokenizer, + eval_packing, + args.dataset_text_field, + args.max_seq_length, + formatting_func, + args.num_of_sequences, + args.chars_per_token, + remove_unused_columns=args.remove_unused_columns if args is not None else True, + **args.dataset_kwargs, + ) + if not _multiple: + eval_dataset = _eval_datasets["singleton"] + + if tokenizer.padding_side is not None and tokenizer.padding_side != "right": + warnings.warn( + "You passed a tokenizer with `padding_side` not equal to `right` to the SFTTrainer. This might lead to some unexpected behaviour due to " + "overflow issues when training a model in half-precision. You might consider adding `tokenizer.padding_side = 'right'` to your code." + ) + + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + tokenizer=tokenizer, + model_init=model_init, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Add tags for models that have been loaded with the correct transformers version + if hasattr(self.model, "add_model_tags"): + self.model.add_model_tags(self._tag_names) + + if self.args.max_steps > 0 and args.packing: + warnings.warn( + "You passed `packing=True` to the SFTTrainer/SFTConfig, and you are training your model with `max_steps` strategy. The dataset will be iterated until the `max_steps` are reached." 
+ ) + self.train_dataset.infinite = True + elif self.args.max_steps == -1 and args.packing: + self.train_dataset.infinite = False + + if any(isinstance(callback, RichProgressCallback) for callback in self.callback_handler.callbacks): + for callback in self.callback_handler.callbacks: + # Remove the PrinterCallback to avoid duplicated prints in case we passed a `RichProgressCallback` + if callback.__class__.__name__ == "PrinterCallback": + self.callback_handler.pop_callback(callback) + + @wraps(_TrainerForNeuron.train) + def train(self, *args, **kwargs): + # Activate neftune right before training. + if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune: + self.model = self._trl_activate_neftune(self.model) + + output = super().train(*args, **kwargs) + + # After training we make sure to retrieve back the original forward pass method + # for the embedding layer by removing the forward post hook. + if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune: + unwrapped_model = unwrap_model(self.model) + if is_peft_available() and isinstance(unwrapped_model, NeuronPeftModel): + embeddings = unwrapped_model.base_model.model.get_input_embeddings() + else: + embeddings = unwrapped_model.get_input_embeddings() + + self.neftune_hook_handle.remove() + del embeddings.neftune_noise_alpha + + return output + + def _prepare_non_packed_dataloader( + self, + tokenizer, + dataset, + dataset_text_field, + max_seq_length, + formatting_func=None, + add_special_tokens=True, + remove_unused_columns=True, + ): + use_formatting_func = formatting_func is not None and dataset_text_field is None + self._dataset_sanity_checked = False + + # Inspired from: https://huggingface.co/learn/nlp-course/chapter7/6?fw=pt + def tokenize(element): + outputs = tokenizer( + element[dataset_text_field] if not use_formatting_func else formatting_func(element), + add_special_tokens=add_special_tokens, + truncation=True, + # For Neuron we need to pad because otherwise it will trigger compilation for each new sequence length. + padding="max_length", + max_length=max_seq_length, + return_overflowing_tokens=False, + return_length=False, + ) + + if use_formatting_func and not self._dataset_sanity_checked: + if not isinstance(formatting_func(element), list): + raise ValueError( + "The `formatting_func` should return a list of processed strings since it can lead to silent bugs." + ) + else: + self._dataset_sanity_checked = True + + return {"input_ids": outputs["input_ids"], "attention_mask": outputs["attention_mask"]} + + signature_columns = ["input_ids", "labels", "attention_mask"] + + if dataset.column_names is not None: # None for IterableDataset + extra_columns = list(set(dataset.column_names) - set(signature_columns)) + else: + extra_columns = [] + + if not remove_unused_columns and len(extra_columns) > 0: + warnings.warn( + "You passed `remove_unused_columns=False` on a non-packed dataset. This might create some issues with the default collator and yield to errors. If you want to " + f"inspect dataset other columns (in this case {extra_columns}), you can subclass `DataCollatorForLanguageModeling` in case you used the default collator and create your own data collator in order to inspect the unused dataset columns." 
+ ) + + map_kwargs = { + "batched": True, + "remove_columns": dataset.column_names if remove_unused_columns else None, + "batch_size": self.dataset_batch_size, + } + if isinstance(dataset, datasets.Dataset): + map_kwargs["num_proc"] = self.dataset_num_proc # this arg is not available for IterableDataset + tokenized_dataset = dataset.map(tokenize, **map_kwargs) + + return tokenized_dataset diff --git a/optimum/neuron/training_args.py b/optimum/neuron/training_args.py index ce6e34a0b..176373716 100644 --- a/optimum/neuron/training_args.py +++ b/optimum/neuron/training_args.py @@ -130,7 +130,7 @@ def __post_init__(self): # Patches accelerate.utils.imports.is_tpu_available to match `is_torch_xla_available` patch_accelerate_is_torch_xla_available() - if self.fsdp != "": + if self.fsdp not in ["", []]: raise RuntimeError("FSDP is not supported.") if self.fp16: diff --git a/optimum/neuron/utils/__init__.py b/optimum/neuron/utils/__init__.py index ce8283639..0c4e60209 100644 --- a/optimum/neuron/utils/__init__.py +++ b/optimum/neuron/utils/__init__.py @@ -40,6 +40,7 @@ "is_torch_neuronx_available", "is_torch_xla_available", "is_transformers_neuronx_available", + "is_trl_available", ], "input_generators": [ "DummyBeamValuesGenerator", @@ -73,6 +74,7 @@ "is_model_officially_supported", "patch_transformers_for_neuron_sdk", ], + "trl_utils": ["NeuronSFTConfig"], } if TYPE_CHECKING: @@ -97,6 +99,7 @@ is_torch_neuronx_available, is_torch_xla_available, is_transformers_neuronx_available, + is_trl_available, ) from .input_generators import ( ASTDummyAudioInputGenerator, @@ -130,6 +133,7 @@ is_model_officially_supported, patch_transformers_for_neuron_sdk, ) + from .trl_utils import NeuronSFTConfig else: import sys diff --git a/optimum/neuron/utils/import_utils.py b/optimum/neuron/utils/import_utils.py index 11340e1d6..ebfc7d81d 100644 --- a/optimum/neuron/utils/import_utils.py +++ b/optimum/neuron/utils/import_utils.py @@ -65,3 +65,14 @@ def is_accelerate_available(min_version: Optional[str] = MIN_ACCELERATE_VERSION) def is_torch_neuronx_available() -> bool: return importlib.util.find_spec("torch_neuronx") is not None + + +def is_trl_available() -> bool: + trl_available = importlib.util.find_spec("trl") is not None + if trl_available: + import trl + + if version.parse(trl.__version__) >= version.parse("0.10.0"): + return True + raise RuntimeError("Only `trl` 0.10.0 and more recent is supported.") + return False diff --git a/optimum/neuron/utils/trl_utils.py b/optimum/neuron/utils/trl_utils.py new file mode 100644 index 000000000..c3b4d129c --- /dev/null +++ b/optimum/neuron/utils/trl_utils.py @@ -0,0 +1,35 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Utilities related to the TRL library and support.""" + +from dataclasses import dataclass + +from ..training_args import NeuronTrainingArguments +from .import_utils import is_trl_available + + +if is_trl_available(): + from trl import SFTConfig +else: + + @dataclass + class SFTConfig: + def __init__(self, *args, **kwargs): + raise RuntimeError("You need to install the `trl` library to use the `NeuronSFTConfig`.") + + +@dataclass +class NeuronSFTConfig(NeuronTrainingArguments, SFTConfig): + pass diff --git a/optimum/neuron/version.py b/optimum/neuron/version.py index 9d4ec89f1..78c865a5e 100644 --- a/optimum/neuron/version.py +++ b/optimum/neuron/version.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.0.24.dev0" +__version__ = "0.0.25.dev0" __sdk_version__ = "2.19.1" diff --git a/pyproject.toml b/pyproject.toml index 572fda1d3..01d30af5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ line-length = 119 # Never enforce `E501` (line length violations). ignore = ["C901", "E501", "E741", "W605"] select = ["C", "E", "F", "I", "W"] +exclude = ["*.ipynb"] # Ignore import violations in all `__init__.py` files. [tool.ruff.lint.per-file-ignores] diff --git a/setup.py b/setup.py index 63febc0fd..a389de8dc 100644 --- a/setup.py +++ b/setup.py @@ -13,9 +13,9 @@ INSTALL_REQUIRES = [ - "transformers == 4.41.1", + "transformers == 4.43.2", "accelerate == 0.29.2", - "optimum ~= 1.20.0", + "optimum ~= 1.21.0", "huggingface_hub >= 0.20.1", "numpy>=1.22.2, <=1.25.2", "protobuf<4", @@ -33,6 +33,7 @@ "safetensors", "sentence-transformers >= 2.2.0", "peft", + "trl", "compel", "rjieba", "soundfile", diff --git a/tests/cache/test_neuronx_cache.py b/tests/cache/test_neuronx_cache.py index a5f3c3886..684e3b6fe 100644 --- a/tests/cache/test_neuronx_cache.py +++ b/tests/cache/test_neuronx_cache.py @@ -34,14 +34,14 @@ ) from optimum.neuron.utils import get_hub_cached_entries, synchronize_hub_cache from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx -from optimum.utils.testing_utils import TOKEN @pytest.fixture -def cache_repos(): +def cache_repos(staging): # Setup: create temporary Hub repository and local cache directory - api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) - user = api.whoami()["name"] + token = staging["token"] + user = staging["user"] + api = HfApi(endpoint=ENDPOINT_STAGING, token=token) hostname = socket.gethostname() cache_repo_id = f"{user}/{hostname}-optimum-neuron-cache" if api.repo_exists(cache_repo_id): @@ -57,7 +57,7 @@ def cache_repos(): os.environ["NEURON_COMPILE_CACHE_URL"] = cache_path os.environ["CUSTOM_CACHE_REPO"] = cache_repo_id os.environ["HF_ENDPOINT"] = ENDPOINT_STAGING - os.environ["HF_TOKEN"] = TOKEN + os.environ["HF_TOKEN"] = token yield (cache_path, cache_repo_id) # Teardown api.delete_repo(cache_repo_id) @@ -173,7 +173,8 @@ def check_traced_cache_entry(cache_path): def assert_local_and_hub_cache_sync(cache_path, cache_repo_id): - api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) + # Since created models are public on the staging endpoint we don't need a token + api = HfApi(endpoint=ENDPOINT_STAGING) remote_files = api.list_repo_files(cache_repo_id) local_files = get_local_cached_files(cache_path) for file in local_files: diff --git a/tests/conftest.py b/tests/conftest.py index 8062756a5..539136409 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,7 +26,7 @@ set_neuron_cache_path, ) -from .utils import 
OPTIMUM_INTERNAL_TESTING_CACHE_REPO, get_random_string +from .utils import OPTIMUM_INTERNAL_TESTING_CACHE_REPO, TOKEN_STAGING, USER_STAGING, get_random_string # Inferentia fixtures @@ -171,3 +171,19 @@ def pytest_fixture_setup(fixturedef, request): if getattr(fixturedef.func, "is_dist_fixture", False): dist_fixture_class = fixturedef.func() dist_fixture_class(request) + + +@pytest.fixture +def staging(): + """A pytest fixture only available in huggingface_hub staging mode + + If the huggingface_hub is not operating in staging mode, tests using + that fixture are automatically skipped. + + Returns: + a Dict containing a valid staging user and token. + """ + return { + "user": USER_STAGING, + "token": TOKEN_STAGING, + } diff --git a/tests/decoder/conftest.py b/tests/decoder/conftest.py index c50c3c72c..b8346a7fb 100644 --- a/tests/decoder/conftest.py +++ b/tests/decoder/conftest.py @@ -58,8 +58,7 @@ def _export_model(model_id, export_kwargs, neuron_model_path): try: subprocess.run(export_command, check=True) except subprocess.CalledProcessError as e: - logger.error(f"Failed to export model: {e}") - return + raise SystemError(f"Failed to export model: {e}") @pytest.fixture(scope="session", params=DECODER_MODEL_CONFIGURATIONS.keys()) diff --git a/tests/decoder/test_decoder_pipelines.py b/tests/decoder/test_decoder_pipelines.py index 9f850a871..83045f93b 100644 --- a/tests/decoder/test_decoder_pipelines.py +++ b/tests/decoder/test_decoder_pipelines.py @@ -35,7 +35,7 @@ def _test_generation(p): # We only ever generate one sequence per input sequence = output[0] if return_tensors: - input_ids = p.tokenizer(input, add_special_tokens=False).input_ids + input_ids = p.tokenizer(input).input_ids assert sequence["generated_token_ids"][: len(input_ids)] == input_ids else: assert sequence["generated_text"].startswith(input) diff --git a/tests/generation/test_hub.py b/tests/generation/test_hub.py index 7e372ad9a..d94e0f0e6 100644 --- a/tests/generation/test_hub.py +++ b/tests/generation/test_hub.py @@ -13,39 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os -import re from huggingface_hub import HfApi from transformers.testing_utils import ENDPOINT_STAGING from optimum.neuron import NeuronModelForSeq2SeqLM from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx -from optimum.utils.testing_utils import TOKEN, USER - - -def _test_push_to_hub(model, model_path, repo_id, ignore_patterns=[]): - model.push_to_hub(model_path, repo_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING) - api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN) - try: - hub_files_path = api.list_repo_files(repo_id) - for path, _, files in os.walk(model_path): - for name in files: - local_file_path = os.path.join(path, name) - hub_file_path = os.path.relpath(local_file_path, model_path) - excluded = False - for pattern in ignore_patterns: - if re.compile(pattern).match(hub_file_path) is not None: - excluded = True - break - assert excluded or hub_file_path in hub_files_path - finally: - api.delete_repo(repo_id) - - -def neuron_push_model_id(model_id): - model_name = model_id.split("/")[-1] - repo_id = f"{USER}/{model_name}-neuronx" - return repo_id @is_inferentia_test @requires_neuronx @@ -59,6 +32,18 @@ def test_seq2seq_model_from_hub(): @is_inferentia_test @requires_neuronx -def test_push_seq2seq_to_hub(neuron_seq2seq_greedy_path, neuron_push_seq2seq_id): +def test_push_seq2seq_to_hub(neuron_seq2seq_greedy_path, neuron_push_seq2seq_id, staging): model = NeuronModelForSeq2SeqLM.from_pretrained(neuron_seq2seq_greedy_path) - _test_push_to_hub(model, neuron_seq2seq_greedy_path, neuron_push_seq2seq_id) + model.push_to_hub( + neuron_seq2seq_greedy_path, neuron_push_seq2seq_id, use_auth_token=staging["token"], endpoint=ENDPOINT_STAGING + ) + api = HfApi(endpoint=ENDPOINT_STAGING, token=staging["token"]) + try: + hub_files_path = api.list_repo_files(neuron_push_seq2seq_id) + for path, _, files in os.walk(neuron_seq2seq_greedy_path): + for name in files: + local_file_path = os.path.join(path, name) + hub_file_path = os.path.relpath(local_file_path, neuron_seq2seq_greedy_path) + assert hub_file_path in hub_files_path + finally: + api.delete_repo(neuron_push_seq2seq_id) diff --git a/tests/test_cache_utils.py b/tests/test_cache_utils.py index 9e7d45370..0effb7387 100644 --- a/tests/test_cache_utils.py +++ b/tests/test_cache_utils.py @@ -37,9 +37,8 @@ set_neuron_cache_path, ) from optimum.neuron.utils.testing_utils import is_trainium_test -from optimum.utils.testing_utils import TOKEN, USER -from .utils import StagingTestMixin, TrainiumTestMixin, get_random_string +from .utils import TOKEN_STAGING, USER_STAGING, StagingTestMixin, TrainiumTestMixin, get_random_string DUMMY_COMPILER_VERSION = "1.2.3" @@ -147,10 +146,10 @@ def test_list_files_in_neuron_cache(self): class StagingNeuronUtilsTestCase(StagingTestMixin, TestCase): def test_set_custom_cache_repo_name_in_hf_home(self): orig_token = get_token() - login(TOKEN) + login(TOKEN_STAGING) repo_name = f"blablabla-{self.seed}" - repo_id = f"{USER}/{repo_name}" + repo_id = f"{USER_STAGING}/{repo_name}" create_repo(repo_name, repo_type="model") def remove_repo(): diff --git a/tests/test_trainers.py b/tests/test_trainers.py index 58ef2c4c4..514120247 100644 --- a/tests/test_trainers.py +++ b/tests/test_trainers.py @@ -28,7 +28,7 @@ AutoModelForSequenceClassification, ) -from optimum.neuron import NeuronTrainer, NeuronTrainingArguments +from optimum.neuron import NeuronSFTConfig, NeuronSFTTrainer, NeuronTrainer, NeuronTrainingArguments from optimum.neuron.distributed.utils import MODEL_PARALLEL_SHARDS_DIR_NAME from
optimum.neuron.utils import is_neuronx_distributed_available from optimum.neuron.utils.cache_utils import ( @@ -300,7 +300,7 @@ def create_training_args(output_dir, resume_from_checkpoint=None, max_steps=max_ per_device_train_batch_size=train_batch_size, per_device_eval_batch_size=eval_batch_size, max_steps=max_steps, - logging_steps=1, + logging_steps=2, save_steps=5, do_eval=do_eval, output_dir=output_dir, @@ -396,3 +396,69 @@ def preprocess_function(examples): trainer.train(resume_from_checkpoint=True) trainer.evaluate() + + +@is_trainium_test +class TestNeuronSFTTrainer(DistributedTest): + @pytest.fixture( + scope="class", + params=[[2, 1, 1], [2, 2, 1]], + ids=["dp=2", "tp=2"], + ) + def parallel_sizes(self, request): + return request.param + + def _test_sft_trainer(self, parallel_sizes, tmpdir, packing): + _, tp_size, pp_size = parallel_sizes + + output_dir = Path(tmpdir) + + dataset = load_dataset("databricks/databricks-dolly-15k", split="train") + + def format_dolly(sample): + instruction = f"### Instruction\n{sample['instruction']}" + context = f"### Context\n{sample['context']}" if len(sample["context"]) > 0 else None + response = f"### Answer\n{sample['response']}" + # join all the parts together + prompt = "\n\n".join([i for i in [instruction, context, response] if i is not None]) + if packing: + return prompt + return [prompt] + + tokenizer, model = get_tokenizer_and_tiny_llama_model() + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" # to prevent warnings + + args = NeuronTrainingArguments( + output_dir=output_dir, + do_train=True, + max_steps=20, + per_device_train_batch_size=1, + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + logging_steps=1, + ) + args = args.to_dict() + sft_config = NeuronSFTConfig( + max_seq_length=512, + packing=packing, + dataset_num_proc=1, + **args, + ) + + # Create Trainer instance + trainer = NeuronSFTTrainer( + model=model, + tokenizer=tokenizer, + train_dataset=dataset, + formatting_func=format_dolly, + args=sft_config, + ) + + trainer.train() + + def test_without_packing(self, parallel_sizes, tmpdir): + return self._test_sft_trainer(parallel_sizes, tmpdir, False) + + def test_with_packing(self, parallel_sizes, tmpdir): + return self._test_sft_trainer(parallel_sizes, tmpdir, True) diff --git a/tests/utils.py b/tests/utils.py index 7bd6c279b..060c77596 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -58,11 +58,15 @@ from optimum.neuron.utils.patching import DynamicPatch, Patcher from optimum.neuron.utils.require_utils import requires_neuronx_distributed from optimum.utils import logging -from optimum.utils.testing_utils import TOKEN, USER logger = logging.get_logger(__name__) + +# Not critical, only usable on the sandboxed CI instance. 
+USER_STAGING = "__DUMMY_OPTIMUM_USER__" +TOKEN_STAGING = "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" + SEED = 42 OPTIMUM_INTERNAL_TESTING_CACHE_REPO = "optimum-internal-testing/optimum-neuron-cache-for-testing" @@ -450,7 +454,7 @@ def tearDownClass(cls): class StagingTestMixin: CUSTOM_CACHE_REPO_NAME = "optimum-neuron-cache-testing" - CUSTOM_CACHE_REPO = f"{USER}/{CUSTOM_CACHE_REPO_NAME}" + CUSTOM_CACHE_REPO = f"{USER_STAGING}/{CUSTOM_CACHE_REPO_NAME}" CUSTOM_PRIVATE_CACHE_REPO = f"{CUSTOM_CACHE_REPO}-private" _token = "" MAX_NUM_LINEARS = 20 @@ -468,8 +472,8 @@ def set_hf_hub_token(cls, token: Optional[str]) -> Optional[str]: @classmethod def setUpClass(cls): - cls._staging_token = TOKEN - cls._token = cls.set_hf_hub_token(TOKEN) + cls._staging_token = TOKEN_STAGING + cls._token = cls.set_hf_hub_token(TOKEN_STAGING) cls._custom_cache_repo_name = load_custom_cache_repo_name_from_hf_home() delete_custom_cache_repo_name_from_hf_home() @@ -511,6 +515,6 @@ def remove_all_files_in_repo(self, repo_id: str): pass def tearDown(self): - login(TOKEN) + login(TOKEN_STAGING) self.remove_all_files_in_repo(self.CUSTOM_CACHE_REPO) self.remove_all_files_in_repo(self.CUSTOM_PRIVATE_CACHE_REPO) diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile index 597a3bea7..09b8f1e80 100644 --- a/text-generation-inference/Dockerfile +++ b/text-generation-inference/Dockerfile @@ -8,7 +8,7 @@ RUN tar -C /tgi -xf /tgi/sources.tar.gz --strip-components=1 # Build cargo components (adapted from TGI original Dockerfile) # Note that the build image is aligned on the same Linux version as the base image (Debian bookworm/ Ubuntu 22.04) -FROM lukemathwalker/cargo-chef:latest-rust-1.75-bookworm AS chef +FROM lukemathwalker/cargo-chef:latest-rust-1.79-bookworm AS chef WORKDIR /usr/src ARG CARGO_REGISTRIES_CRATES_IO_PROTOCOL=sparse @@ -20,8 +20,6 @@ COPY --from=tgi /tgi/proto proto COPY --from=tgi /tgi/benchmark benchmark COPY --from=tgi /tgi/router router COPY --from=tgi /tgi/launcher launcher -# Remove the next line when bumping rust version -RUN cargo update ravif --precise 0.11.6 RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder @@ -41,6 +39,8 @@ COPY --from=tgi /tgi/proto proto COPY --from=tgi /tgi/benchmark benchmark COPY --from=tgi /tgi/router router COPY --from=tgi /tgi/launcher launcher +# Remove this line once TGI has fixed the conflict +RUN cargo update ureq --precise 2.9.7 RUN cargo build --release --workspace --exclude benchmark # Python base image diff --git a/text-generation-inference/server/text_generation_server/cli.py b/text-generation-inference/server/text_generation_server/cli.py index 5aa9cceb6..59c09125c 100644 --- a/text-generation-inference/server/text_generation_server/cli.py +++ b/text-generation-inference/server/text_generation_server/cli.py @@ -17,6 +17,9 @@ def serve( uds_path: str = "/tmp/text-generation-server", logger_level: str = "INFO", json_output: bool = False, + otlp_endpoint: Optional[str] = None, + otlp_service_name: str = "text-generation-inference.server", + max_input_tokens: Optional[int] = None, ): """This is the main entry-point for the server CLI. @@ -36,6 +39,12 @@ def serve( The server logger level. Defaults to *INFO*. json_output (`bool`): Use JSON format for log serialization. + otlp_endpoint (`Optional[str]`, defaults to `None`): + The Open Telemetry endpoint to use. + otlp_service_name (`Optional[str]`, defaults to `None`): + The name to use when pushing data to the Open Telemetry endpoint. 
+ max_input_tokens (`Optional[int]`, defaults to `None`): + The maximum number of input tokens each request should contain. """ if sharded: raise ValueError("Sharding is not supported.") diff --git a/text-generation-inference/server/text_generation_server/generator.py b/text-generation-inference/server/text_generation_server/generator.py index 9be650c98..54eb7d10a 100644 --- a/text-generation-inference/server/text_generation_server/generator.py +++ b/text-generation-inference/server/text_generation_server/generator.py @@ -472,8 +472,18 @@ def decode(self, batches: List[CachedBatch]) -> Tuple[List[Generation], CachedBa # just carry on with decoding. We adopt the id of the first # batch in the list as our next batch id. next_batch_id = batches[0].id + request_ids = [] + for batch in batches: + request_ids += batch.request_ids + cleared_request_ids = [] + for slot in self.slots: + if slot.state == slot.State.READY and slot.request_id not in request_ids: + cleared_request_ids.append(slot.request_id) + slot.clear() + if len(cleared_request_ids) > 0: + logger.info(f"Clearing slot for requests {cleared_request_ids} as they are not requested.") active_slots = [slot for slot in self.slots if slot.state == slot.State.READY] - if len(active_slots) == 0: + if len(active_slots) < len(request_ids): raise ValueError("Unable to decode tokens for non-prefilled batches (probably due to a previous failure)") if self.model.continuous_batching: decode_slots = active_slots diff --git a/text-generation-inference/tests/fixtures/model.py b/text-generation-inference/tests/fixtures/model.py index c94d45784..b1e785308 100644 --- a/text-generation-inference/tests/fixtures/model.py +++ b/text-generation-inference/tests/fixtures/model.py @@ -54,8 +54,7 @@ def export_model(model_id, export_kwargs, neuron_model_path): try: subprocess.run(export_command, check=True) except subprocess.CalledProcessError as e: - logger.error(f"Failed to export model: {e}") - return + raise ValueError(f"Failed to export model: {e}") @pytest.fixture(scope="session", params=MODEL_CONFIGURATIONS.keys()) diff --git a/text-generation-inference/tests/integration/test_implicit_env.py b/text-generation-inference/tests/integration/test_implicit_env.py index ec1708d50..bb090d10c 100644 --- a/text-generation-inference/tests/integration/test_implicit_env.py +++ b/text-generation-inference/tests/integration/test_implicit_env.py @@ -17,8 +17,8 @@ async def tgi_service(request, launcher, neuron_model_config): # the tgi_env.py script will take care of setting these for var in [ "MAX_BATCH_SIZE", - "MAX_INPUT_LENGTH", - "MAX_TOTAL_TOKEN", + "MAX_INPUT_TOKENS", + "MAX_TOTAL_TOKENS", "HF_NUM_CORES", "HF_AUTO_CAST_TYPE", ]: diff --git a/text-generation-inference/tgi_env.py b/text-generation-inference/tgi_env.py index 4584358ae..5c3ea9141 100755 --- a/text-generation-inference/tgi_env.py +++ b/text-generation-inference/tgi_env.py @@ -16,7 +16,7 @@ logger = logging.getLogger(__name__) -tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_LENGTH"] +tgi_router_env_vars = ["MAX_BATCH_SIZE", "MAX_TOTAL_TOKENS", "MAX_INPUT_TOKENS"] tgi_server_env_vars = ["HF_NUM_CORES", "HF_AUTO_CAST_TYPE"] env_config_peering = [ @@ -38,7 +38,9 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace: if not argv: argv = sys.argv # All these are params passed to tgi and intercepted here - parser.add_argument("--max-input-length", type=int, default=os.getenv("MAX_INPUT_LENGTH", 0)) + parser.add_argument( + "--max-input-tokens", type=int, 
default=os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)) + ) parser.add_argument("--max-total-tokens", type=int, default=os.getenv("MAX_TOTAL_TOKENS", 0)) parser.add_argument("--max-batch-size", type=int, default=os.getenv("MAX_BATCH_SIZE", 0)) parser.add_argument("--model-id", type=str, default=os.getenv("MODEL_ID")) @@ -57,8 +59,8 @@ def parse_cmdline_and_set_env(argv: List[str] = None) -> argparse.Namespace: if args.max_total_tokens > 0: os.environ["MAX_TOTAL_TOKENS"] = str(args.max_total_tokens) - if args.max_input_length > 0: - os.environ["MAX_INPUT_LENGTH"] = str(args.max_input_length) + if args.max_input_tokens > 0: + os.environ["MAX_INPUT_TOKENS"] = str(args.max_input_tokens) if args.max_batch_size > 0: os.environ["MAX_BATCH_SIZE"] = str(args.max_batch_size) @@ -73,12 +75,12 @@ def neuron_config_to_env(neuron_config): with open(os.environ["ENV_FILEPATH"], "w") as f: for env_var, config_key in env_config_peering: f.write("export {}={}\n".format(env_var, neuron_config[config_key])) - max_input_length = os.getenv("MAX_INPUT_LENGTH") - if not max_input_length: - max_input_length = int(neuron_config["sequence_length"]) // 2 - if max_input_length == 0: + max_input_tokens = os.getenv("MAX_INPUT_TOKENS") + if not max_input_tokens: + max_input_tokens = int(neuron_config["sequence_length"]) // 2 + if max_input_tokens == 0: raise Exception("Model sequence length should be greater than 1") - f.write("export MAX_INPUT_LENGTH={}\n".format(max_input_length)) + f.write("export MAX_INPUT_TOKENS={}\n".format(max_input_tokens)) def sort_neuron_configs(dictionary): @@ -149,13 +151,13 @@ def check_env_and_neuron_config_compatibility(neuron_config: Dict[str, Any], che ) return False - if os.getenv("MAX_INPUT_LENGTH"): - max_input_length = int(os.environ["MAX_INPUT_LENGTH"]) + max_input_tokens = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0))) + if max_input_tokens > 0: sequence_length = neuron_config["sequence_length"] - if max_input_length >= sequence_length: + if max_input_tokens >= sequence_length: logger.debug( - "Specified max input length is not compatible with config sequence length " "( %s >= %s)", - max_input_length, + "Specified max input tokens is not compatible with config sequence length " "( %s >= %s)", + max_input_tokens, sequence_length, ) return False
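To illustrate the renaming above, here is a small, self-contained sketch (an illustration only, not the server code) of how the effective input-token budget is resolved: `MAX_INPUT_TOKENS` takes precedence, the legacy `MAX_INPUT_LENGTH` is still honoured as a fallback, and otherwise half of the exported sequence length is used; the value must stay strictly below the sequence length.

```python
import os


def resolve_max_input_tokens(sequence_length: int) -> int:
    """Illustrative helper (not part of tgi_env.py) mirroring the fallback order."""
    # New variable first, then the legacy one, then half of the exported sequence length.
    value = int(os.getenv("MAX_INPUT_TOKENS", os.getenv("MAX_INPUT_LENGTH", 0)))
    if value == 0:
        value = sequence_length // 2
    if value >= sequence_length:
        raise ValueError(
            f"max input tokens ({value}) must be smaller than the model sequence length ({sequence_length})"
        )
    return value


# Example: a model exported with sequence_length=4096 and no environment overrides -> 2048
print(resolve_max_input_tokens(4096))
```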