diff --git a/benchmark/text-generation-inference/README.md b/benchmark/text-generation-inference/README.md
index 1dc712fb9..ca6bf4cee 100644
--- a/benchmark/text-generation-inference/README.md
+++ b/benchmark/text-generation-inference/README.md
@@ -2,7 +2,7 @@

 ## Local environment setup

-These configurations are tested and run on an inf2.48xlarge with the Hugging Face Deep Learning AMI from the AWS Marketplace. 
+These configurations are tested and run on an inf2.48xlarge with the Hugging Face Deep Learning AMI from the AWS Marketplace.

 Copy the configurations down using

@@ -44,16 +44,8 @@ Alternatively, you can edit the appropriate docker-compose.yaml to supply the fu

 ## Start the servers

-For smaller models, you can use the multi-server configuration with a load balancer:
-
-```shell
-$ docker compose --env-file llama-7b/.env up
-```
-
-For larger models, use their specific docker files:
-
 ```shell
-$ docker compose -f llama3-70b/docker-compose.yaml --env-file llama3-70b/.env up
+$ docker compose -f llama3-8b/docker-compose.yaml --env-file llama3-8b/.env up
 ```

 Note: edit the .env file to change the model configuration
@@ -87,7 +79,7 @@ $ ./run_all.sh NousResearch/Meta-Llama-3-70B-Instruct

 ### Compiling the model

-If you are trying to run a configuration or a model that is not available in the cache, you can compile the model before you run it, then load it locally. 
+If you are trying to run a configuration or a model that is not available in the cache, you can compile the model before you run it, then load it locally.

 See the [llama3-70b-trn1.32xlarge](llama3-70b-trn1.32xlarge) as an example.

diff --git a/benchmark/text-generation-inference/llama-7b/.env b/benchmark/text-generation-inference/llama-7b/.env
index 2b89e8305..ec8e3811d 100644
--- a/benchmark/text-generation-inference/llama-7b/.env
+++ b/benchmark/text-generation-inference/llama-7b/.env
@@ -1,5 +1,5 @@
 MODEL_ID='NousResearch/Llama-2-7b-chat-hf'
 HF_AUTO_CAST_TYPE='fp16'
-MAX_BATCH_SIZE=32
+MAX_BATCH_SIZE=24
 MAX_INPUT_LENGTH=3072
 MAX_TOTAL_TOKENS=4096
diff --git a/benchmark/text-generation-inference/llama-7b/docker-compose.yaml b/benchmark/text-generation-inference/llama-7b/docker-compose.yaml
new file mode 100644
index 000000000..6d86d5afb
--- /dev/null
+++ b/benchmark/text-generation-inference/llama-7b/docker-compose.yaml
@@ -0,0 +1,57 @@
+version: '3.7'
+
+services:
+  tgi-1:
+    image: neuronx-tgi:latest
+    ports:
+      - "8081:8081"
+    environment:
+      - PORT=8081
+      - MODEL_ID=${MODEL_ID}
+      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
+      - HF_NUM_CORES=12
+      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
+      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
+      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
+      - MAX_CONCURRENT_REQUESTS=512
+    devices:
+      - "/dev/neuron0"
+      - "/dev/neuron1"
+      - "/dev/neuron2"
+      - "/dev/neuron3"
+      - "/dev/neuron4"
+      - "/dev/neuron5"
+
+  tgi-2:
+    image: neuronx-tgi:latest
+    ports:
+      - "8082:8082"
+    environment:
+      - PORT=8082
+      - MODEL_ID=${MODEL_ID}
+      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
+      - HF_NUM_CORES=12
+      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
+      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
+      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
+      - MAX_CONCURRENT_REQUESTS=512
+    devices:
+      - "/dev/neuron6"
+      - "/dev/neuron7"
+      - "/dev/neuron8"
+      - "/dev/neuron9"
+      - "/dev/neuron10"
+      - "/dev/neuron11"
+
+  loadbalancer:
+    image: nginx:alpine
+    ports:
+      - "8080:80"
+    volumes:
+      - ./nginx.conf:/etc/nginx/nginx.conf:ro
+    depends_on:
+      - tgi-1
+      - tgi-2
+    deploy:
+      placement:
+        constraints: [node.role == manager]
diff --git a/benchmark/text-generation-inference/llama-7b/nginx.conf b/benchmark/text-generation-inference/llama-7b/nginx.conf
new file mode 100644
index 000000000..01a8f657b
--- /dev/null
+++ b/benchmark/text-generation-inference/llama-7b/nginx.conf
@@ -0,0 +1,14 @@
+### Nginx TGI Load Balancer
+events {}
+http {
+  upstream tgicluster {
+    server tgi-1:8081;
+    server tgi-2:8082;
+  }
+  server {
+    listen 80;
+    location / {
+      proxy_pass http://tgicluster;
+    }
+  }
+}
diff --git a/benchmark/text-generation-inference/llama-7b/tgi-results.csv b/benchmark/text-generation-inference/llama-7b/tgi-results.csv
index 96f382dc9..faa69bec7 100644
--- a/benchmark/text-generation-inference/llama-7b/tgi-results.csv
+++ b/benchmark/text-generation-inference/llama-7b/tgi-results.csv
@@ -1,11 +1,13 @@
 model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
-huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.811941495564616,0.3781782309997652,71.37198062194233
-huggingface/NousResearch/Llama-2-7b-chat-hf,2,23.461539426271507,0.3602376449998701,71.70553820509232
-huggingface/NousResearch/Llama-2-7b-chat-hf,4,45.45448705790145,0.3612828944997091,73.58663426819392
-huggingface/NousResearch/Llama-2-7b-chat-hf,8,71.13444471932405,0.3752646894999998,74.85884378373552
-huggingface/NousResearch/Llama-2-7b-chat-hf,16,138.54599491404485,0.6447374934998606,81.11484812939682
-huggingface/NousResearch/Llama-2-7b-chat-hf,32,247.32811870027916,1.0393478490004782,85.0958261705239
-huggingface/NousResearch/Llama-2-7b-chat-hf,64,391.3595246354876,2.2831421710016,99.36474989676213
-huggingface/NousResearch/Llama-2-7b-chat-hf,128,464.82600069905294,3.342431744500118,120.29151899306808
-huggingface/NousResearch/Llama-2-7b-chat-hf,256,526.7164477974997,6.532527566999306,160.52458146930456
-huggingface/NousResearch/Llama-2-7b-chat-hf,512,506.7975712115936,27.33909000099993,260.14547684970137
+huggingface/NousResearch/Llama-2-7b-chat-hf,1,32.32303647791415,0.4092339959997844,32.98457994546189
+huggingface/NousResearch/Llama-2-7b-chat-hf,8,280.2455817454919,0.4103973410001345,18.824823855788903
+huggingface/NousResearch/Llama-2-7b-chat-hf,16,606.2237208004269,0.42390128999977605,19.73879150452322
+huggingface/NousResearch/Llama-2-7b-chat-hf,24,778.5847225896651,0.44628154350084515,21.729555672304947
+huggingface/NousResearch/Llama-2-7b-chat-hf,32,660.0774421854719,0.6625862749997395,40.97050480951723
+huggingface/NousResearch/Llama-2-7b-chat-hf,50,809.3513111702051,1.1112228684996808,32.355166522075024
+huggingface/NousResearch/Llama-2-7b-chat-hf,64,902.2019208540152,1.518584174499665,34.52519498921747
+huggingface/NousResearch/Llama-2-7b-chat-hf,96,1000.426066970307,2.581174633500268,44.06432188527795
+huggingface/NousResearch/Llama-2-7b-chat-hf,100,965.894643860531,5.110174397500032,44.77109148855796
+huggingface/NousResearch/Llama-2-7b-chat-hf,128,1070.1775339135268,7.600166947499474,49.67094403911358
+huggingface/NousResearch/Llama-2-7b-chat-hf,150,1059.6704842739082,8.564977125500263,55.47304516981905
+huggingface/NousResearch/Llama-2-7b-chat-hf,200,1138.036018763616,13.465086967999923,64.28243222745746
diff --git a/benchmark/text-generation-inference/llama3-70b-inf2.48xlarge/tgi-results.csv b/benchmark/text-generation-inference/llama3-70b-inf2.48xlarge/tgi-results.csv
index 39bb30005..fe0313b6e 100644
--- a/benchmark/text-generation-inference/llama3-70b-inf2.48xlarge/tgi-results.csv
+++ b/benchmark/text-generation-inference/llama3-70b-inf2.48xlarge/tgi-results.csv
@@ -1,11 +1,4 @@
 model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,18.818667211424472,1.3884793975012144,51.46871325828836
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,2,32.22257477833452,2.0121661404991755,56.734265583687296
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,4,50.19917175671667,5.205651430500438,66.04042245148653
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,52.13272738944358,9.568476632499369,97.32615035298838
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,53.59997031445967,26.087651531999654,191.19227161475598
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,32,56.08684244759754,61.25285707449984,310.16900484570965
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,64,57.40338464731561,129.3146581359997,560.2474255463762
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,128,58.39025853766574,267.3882590960002,1094.9986170264501
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,256,58.589480601098536,541.6153878579971,2147.5413489446523
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,512,58.69645477077839,1085.1772966810022,4231.7554182432905
\ No newline at end of file
+huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,22.4846564103628,1.2006561384987435,43.7079989917263
+huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,64.09420641185218,4.925495064999268,85.36754380113435
+huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,66.62450133873871,22.87049979600488,135.4123021951354
diff --git a/benchmark/text-generation-inference/llama3-8b/.env b/benchmark/text-generation-inference/llama3-8b/.env
new file mode 100644
index 000000000..d0b814f71
--- /dev/null
+++ b/benchmark/text-generation-inference/llama3-8b/.env
@@ -0,0 +1,5 @@
+MODEL_ID='NousResearch/Meta-Llama-3-8B-Instruct'
+HF_AUTO_CAST_TYPE='fp16'
+MAX_BATCH_SIZE=32
+MAX_INPUT_LENGTH=4000
+MAX_TOTAL_TOKENS=4096
diff --git a/benchmark/text-generation-inference/docker-compose.yaml b/benchmark/text-generation-inference/llama3-8b/docker-compose.yaml
similarity index 100%
rename from benchmark/text-generation-inference/docker-compose.yaml
rename to benchmark/text-generation-inference/llama3-8b/docker-compose.yaml
diff --git a/benchmark/text-generation-inference/nginx.conf b/benchmark/text-generation-inference/llama3-8b/nginx.conf
similarity index 100%
rename from benchmark/text-generation-inference/nginx.conf
rename to benchmark/text-generation-inference/llama3-8b/nginx.conf
diff --git a/benchmark/text-generation-inference/llama3-8b/tgi-results.csv b/benchmark/text-generation-inference/llama3-8b/tgi-results.csv
new file mode 100644
index 000000000..c7ea60d67
--- /dev/null
+++ b/benchmark/text-generation-inference/llama3-8b/tgi-results.csv
@@ -0,0 +1,13 @@
+model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,1,44.72947298811359,0.2930618720001803,21.387192412995546
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,8,254.26638394677616,0.3072573690005811,24.51789344634094
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,16,396.49578354796415,0.31329568949968234,29.41915454622028
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,24,458.9461712504898,0.31723227349903027,36.45821381291491
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,32,540.3852559365118,0.31949053349944734,39.548380672987705
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,50,648.5983772802653,0.6981559694995667,47.64409672960739
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,64,729.1746189461367,0.8981061290014623,51.60748655120524
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,96,823.7525735951876,1.1334064394995949,60.62761554646364
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,100,829.9821677199822,1.2774171685014153,63.72698943226848
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,128,838.2579077776568,1.5125607664995186,74.21572967679927
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,150,849.2518611727152,1.7492157529995893,80.71755481115194
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,200,864.4918328198065,2.4135443449995364,98.63394209087461
diff --git a/benchmark/text-generation-inference/mistral-7b/docker-compose.yaml b/benchmark/text-generation-inference/mistral-7b/docker-compose.yaml
new file mode 100644
index 000000000..be606f265
--- /dev/null
+++ b/benchmark/text-generation-inference/mistral-7b/docker-compose.yaml
@@ -0,0 +1,73 @@
+version: '3.7'
+
+services:
+  tgi-1:
+    image: neuronx-tgi:latest
+    ports:
+      - "8081:8081"
+    environment:
+      - PORT=8081
+      - MODEL_ID=${MODEL_ID}
+      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
+      - HF_NUM_CORES=8
+      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
+      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
+      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
+      - MAX_CONCURRENT_REQUESTS=512
+    devices:
+      - "/dev/neuron0"
+      - "/dev/neuron1"
+      - "/dev/neuron2"
+      - "/dev/neuron3"
+
+  tgi-2:
+    image: neuronx-tgi:latest
+    ports:
+      - "8082:8082"
+    environment:
+      - PORT=8082
+      - MODEL_ID=${MODEL_ID}
+      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
+      - HF_NUM_CORES=8
+      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
+      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
+      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
+      - MAX_CONCURRENT_REQUESTS=512
+    devices:
+      - "/dev/neuron4"
+      - "/dev/neuron5"
+      - "/dev/neuron6"
+      - "/dev/neuron7"
+
+  tgi-3:
+    image: neuronx-tgi:latest
+    ports:
+      - "8083:8083"
+    environment:
+      - PORT=8083
+      - MODEL_ID=${MODEL_ID}
+      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
+      - HF_NUM_CORES=8
+      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
+      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
+      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
+      - MAX_CONCURRENT_REQUESTS=512
+    devices:
+      - "/dev/neuron8"
+      - "/dev/neuron9"
+      - "/dev/neuron10"
+      - "/dev/neuron11"
+
+  loadbalancer:
+    image: nginx:alpine
+    ports:
+      - "8080:80"
+    volumes:
+      - ./nginx.conf:/etc/nginx/nginx.conf:ro
+    depends_on:
+      - tgi-1
+      - tgi-2
+      - tgi-3
+    deploy:
+      placement:
+        constraints: [node.role == manager]
diff --git a/benchmark/text-generation-inference/mistral-7b/nginx.conf b/benchmark/text-generation-inference/mistral-7b/nginx.conf
new file mode 100644
index 000000000..37a3b8721
--- /dev/null
+++ b/benchmark/text-generation-inference/mistral-7b/nginx.conf
@@ -0,0 +1,15 @@
+### Nginx TGI Load Balancer
+events {}
+http {
+  upstream tgicluster {
+    server tgi-1:8081;
+    server tgi-2:8082;
+    server tgi-3:8083;
+  }
+  server {
+    listen 80;
+    location / {
+      proxy_pass http://tgicluster;
+    }
+  }
+}
diff --git a/benchmark/text-generation/llama2-7b.py b/benchmark/text-generation/llama2-7b.py
index 754475c45..e4a7541b6 100644
--- a/benchmark/text-generation/llama2-7b.py
+++ b/benchmark/text-generation/llama2-7b.py
@@ -8,7 +8,7 @@


 def main():
-    NUM_CORES = 8
+    NUM_CORES = 12
     num_cores = get_available_cores()
     if num_cores < NUM_CORES:
         raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
@@ -18,7 +18,7 @@ def main():
         "Llama-2-7B-BS4": ["meta-llama/Llama-2-7b-chat-hf", 4, 4096],
         "Llama-2-7B-BS8": ["meta-llama/Llama-2-7b-chat-hf", 8, 4096],
"Llama-2-7B-BS16": ["meta-llama/Llama-2-7b-chat-hf", 16, 4096], - "Llama-2-7B-BS32": ["meta-llama/Llama-2-7b-chat-hf", 32, 4096], + "Llama-2-7B-BS24": ["meta-llama/Llama-2-7b-chat-hf", 24, 4096], } for model_name, model_configuration in model_configurations.items(): diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/latency.png b/docs/assets/benchmarks/inferentia-llama2-7b/latency.png index 2f657938b..8b26732b6 100644 Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/latency.png and b/docs/assets/benchmarks/inferentia-llama2-7b/latency.png differ diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png b/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png index 955ee60de..a1fe59446 100644 Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png and b/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png differ diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png b/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png index 8844a75e7..ec7a219db 100644 Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png and b/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png differ diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/latency.png b/docs/assets/benchmarks/inferentia-llama3-8b/latency.png index b0af4a39e..e00192997 100644 Binary files a/docs/assets/benchmarks/inferentia-llama3-8b/latency.png and b/docs/assets/benchmarks/inferentia-llama3-8b/latency.png differ diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png b/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png index 37deedd97..8dc1c23dc 100644 Binary files a/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png and b/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png differ diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png b/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png index 2dd4abe77..a76e48020 100644 Binary files a/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png and b/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png differ diff --git a/docs/source/benchmarks/inferentia-llama2-7b.mdx b/docs/source/benchmarks/inferentia-llama2-7b.mdx index 6503a9d80..2a9de6508 100644 --- a/docs/source/benchmarks/inferentia-llama2-7b.mdx +++ b/docs/source/benchmarks/inferentia-llama2-7b.mdx @@ -26,9 +26,9 @@ For this benchmark we will use the following configurations: | Llama2 7B BS4 | 4 | 4096 | | Llama2 7B BS8 | 8 | 4096 | | Llama2 7B BS16 | 16 | 4096 | -| Llama2 7B BS32 | 32 | 4096 | +| Llama2 7B BS32 | 24 | 4096 | -*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.* +*Note: all models are compiled to use 6 devices corresponding to 12 cores on the `inf2.48xlarge` instance.* *Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.* diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 3ff82e886..689de5331 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -869,4 +869,4 @@ class MistralNeuronConfig(TextNeuronDecoderConfig): @register_in_tasks_manager("mixtral", "text-generation") class MixtralNeuronConfig(TextNeuronDecoderConfig): NEURONX_CLASS = "mixtral.model.MixtralForSampling" - CONTINUOUS_BATCHING = True + CONTINUOUS_BATCHING = False diff --git a/optimum/neuron/distributed/checkpointing.py b/optimum/neuron/distributed/checkpointing.py index 
--- a/optimum/neuron/distributed/checkpointing.py
+++ b/optimum/neuron/distributed/checkpointing.py
@@ -65,7 +65,7 @@ def convert_fn(tensors):
         return rewritten_tensors

     def select_fn(v):
-        return type(v) == xser.TensorReference
+        return type(v) is xser.TensorReference

     return xm.ToXlaTensorArena(convert_fn, select_fn).transform(ref_data)

diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py
index da83a1db0..4d9c4f802 100644
--- a/optimum/neuron/generation/utils.py
+++ b/optimum/neuron/generation/utils.py
@@ -1628,7 +1628,7 @@ def beam_search(
             )

             for k, v in sequence_outputs.items():
-                if type(v) == torch.Tensor:
+                if type(v) is torch.Tensor:
                     sequence_outputs[k] = sequence_outputs[k].to(input_ids.device)

             if return_dict_in_generate:
diff --git a/optimum/neuron/version.py b/optimum/neuron/version.py
index 7c7c0f5a9..9d4ec89f1 100644
--- a/optimum/neuron/version.py
+++ b/optimum/neuron/version.py
@@ -14,4 +14,4 @@

 __version__ = "0.0.24.dev0"

-__sdk_version__ = "2.18.0"
+__sdk_version__ = "2.19.1"
diff --git a/setup.py b/setup.py
index e29fdf22d..63febc0fd 100644
--- a/setup.py
+++ b/setup.py
@@ -61,12 +61,12 @@
     ],
     "neuronx": [
        "wheel",
-        "neuronx-cc==2.13.66.0",
-        "torch-neuronx==2.1.2.2.1.0",
-        "transformers-neuronx==0.10.0.21",
+        "neuronx-cc==2.14.227.0",
+        "torch-neuronx==2.1.2.2.2.0",
+        "transformers-neuronx==0.11.351",
        "torch==2.1.2.*",
        "torchvision==0.16.*",
-        "neuronx_distributed==0.7.0",
+        "neuronx_distributed==0.8.0",
     ],
     "diffusers": ["diffusers>=0.28.0, <0.29.0", "peft"],
     "sentence-transformers": ["sentence-transformers >= 2.2.0"],
@@ -77,7 +77,7 @@
     version=__version__,
     description=(
         "Optimum Neuron is the interface between the Hugging Face Transformers and Diffusers libraries and AWS "
-        "Tranium and Inferentia accelerators. It provides a set of tools enabling easy model loading, training and "
+        "Trainium and Inferentia accelerators. It provides a set of tools enabling easy model loading, training and "
         "inference on single and multiple neuron core settings for different downstream tasks."
     ),
     long_description=open("README.md", "r", encoding="utf-8").read(),
diff --git a/tests/decoder/conftest.py b/tests/decoder/conftest.py
index 1b02a20de..c50c3c72c 100644
--- a/tests/decoder/conftest.py
+++ b/tests/decoder/conftest.py
@@ -30,13 +30,17 @@
         "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
     },
     "llama": {
-        "model_id": "princeton-nlp/Sheared-LLaMA-1.3B",
+        "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
         "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
     },
     "mistral": {
         "model_id": "optimum/mistral-1.1b-testing",
         "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
     },
+    "mixtral": {
+        "model_id": "dacorvo/Mixtral-tiny",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
 }


diff --git a/tests/pipelines/test_decoder_pipelines.py b/tests/decoder/test_decoder_pipelines.py
similarity index 72%
rename from tests/pipelines/test_decoder_pipelines.py
rename to tests/decoder/test_decoder_pipelines.py
index d9a2025b4..9f850a871 100644
--- a/tests/pipelines/test_decoder_pipelines.py
+++ b/tests/decoder/test_decoder_pipelines.py
@@ -43,45 +43,44 @@ def _test_generation(p):

 @is_inferentia_test
 @requires_neuronx
-def test_export_no_parameters(inf_decoder_model):
-    p = pipeline("text-generation", inf_decoder_model, export=True)
+def test_export_no_parameters():
+    p = pipeline("text-generation", "gpt2", export=True)
     _test_generation(p)


 @is_inferentia_test
 @requires_neuronx
-def test_load_no_parameters(inf_decoder_path):
-    p = pipeline("text-generation", inf_decoder_path)
+def test_load_no_parameters(neuron_decoder_path):
+    p = pipeline("text-generation", neuron_decoder_path)
     _test_generation(p)


 @is_inferentia_test
 @requires_neuronx
-def test_from_model_and_tokenizer(inf_decoder_path):
-    m = NeuronModelForCausalLM.from_pretrained(inf_decoder_path)
-    t = AutoTokenizer.from_pretrained(inf_decoder_path)
+def test_from_model_and_tokenizer(neuron_decoder_path):
+    m = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
+    t = AutoTokenizer.from_pretrained(neuron_decoder_path)
     p = pipeline("text-generation", model=m, tokenizer=t)
     _test_generation(p)


 @is_inferentia_test
 @requires_neuronx
-def test_error_already_exported(inf_decoder_path):
+def test_error_already_exported(neuron_decoder_path):
     with pytest.raises(ValueError, match="already been exported"):
-        pipeline("text-generation", inf_decoder_path, export=True)
+        pipeline("text-generation", neuron_decoder_path, export=True)


 @is_inferentia_test
 @requires_neuronx
-def test_error_needs_export(inf_decoder_model):
+def test_error_needs_export():
     with pytest.raises(ValueError, match="must be exported"):
-        pipeline("text-generation", inf_decoder_model, export=False)
+        pipeline("text-generation", "gpt2", export=False)


 @is_inferentia_test
 @requires_neuronx
-def test_from_hub():
-    model_id = "dacorvo/tiny-random-gpt2-neuronx"
-    revision = "1b3456cf877cc42c053ee8464f1067021eccde4b"
-    p = pipeline("text-generation", model_id, revision=revision)
+def test_from_hub(neuron_decoder_config):
+    model_id = neuron_decoder_config["neuron_model_id"]
+    p = pipeline("text-generation", model_id)
     _test_generation(p)
diff --git a/tests/pipelines/conftest.py b/tests/pipelines/conftest.py
index cc8aa2dc7..33be04610 100644
--- a/tests/pipelines/conftest.py
+++ b/tests/pipelines/conftest.py
@@ -12,13 +12,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from tempfile import TemporaryDirectory

 import pytest
-from transformers import AutoTokenizer
-
-from optimum.neuron import NeuronModelForCausalLM
-from optimum.neuron.utils.testing_utils import requires_neuronx


 STD_TEXT_TASKS = [
@@ -33,22 +28,3 @@
 @pytest.fixture(scope="module", params=STD_TEXT_TASKS)
 def std_text_task(request):
     return request.param
-
-
-@pytest.fixture(scope="module")
-@requires_neuronx
-def inf_decoder_path(inf_decoder_model):
-    model = NeuronModelForCausalLM.from_pretrained(
-        inf_decoder_model, export=True, batch_size=1, sequence_length=128, num_cores=2
-    )
-    model_dir = TemporaryDirectory()
-    model_path = model_dir.name
-    model.save_pretrained(model_path)
-    del model
-    tokenizer = AutoTokenizer.from_pretrained(inf_decoder_model)
-    tokenizer.save_pretrained(model_path)
-    del tokenizer
-    # Yield instead of returning to keep a reference to the temporary directory.
-    # It will go out of scope and be released only once all tests needing the fixture
-    # have been completed.
-    yield model_path
diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile
index 2941fdd6e..597a3bea7 100644
--- a/text-generation-inference/Dockerfile
+++ b/text-generation-inference/Dockerfile
@@ -94,19 +94,19 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
 # Install neuronx packages
 RUN apt-get update -y \
  && apt-get install -y --no-install-recommends \
-    aws-neuronx-dkms=2.16.7.0 \
-    aws-neuronx-collectives=2.20.22.0-c101c322e \
-    aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 \
-    aws-neuronx-tools=2.17.1.0 \
+    aws-neuronx-dkms=2.17.17.0 \
+    aws-neuronx-collectives=2.21.46.0-69b77134b \
+    aws-neuronx-runtime-lib=2.21.41.0-fb1705f5f \
+    aws-neuronx-tools=2.18.3.0 \
  && rm -rf /var/lib/apt/lists/* \
  && apt-get clean

 ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"

 RUN pip3 install \
-    neuronx-cc==2.13.66.0 \
-    torch-neuronx==2.1.2.2.1.0 \
-    transformers-neuronx==0.10.0.21 \
+    neuronx-cc==2.14.227.0 \
+    torch-neuronx==2.1.2.2.2.0 \
+    transformers-neuronx==0.11.351 \
     --extra-index-url=https://pip.repos.neuron.amazonaws.com

 # Install HuggingFace packages
diff --git a/text-generation-inference/tests/fixtures/model.py b/text-generation-inference/tests/fixtures/model.py
index 5ee46b598..c94d45784 100644
--- a/text-generation-inference/tests/fixtures/model.py
+++ b/text-generation-inference/tests/fixtures/model.py
@@ -30,7 +30,7 @@
         "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
     },
     "llama": {
-        "model_id": "HuggingFaceTB/cosmo-1b",
+        "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
         "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
     },
     "mistral": {
diff --git a/text-generation-inference/tests/integration/test_generate.py b/text-generation-inference/tests/integration/test_generate.py
index 8ee2eaf2d..79beca5c7 100644
--- a/text-generation-inference/tests/integration/test_generate.py
+++ b/text-generation-inference/tests/integration/test_generate.py
@@ -22,7 +22,7 @@ async def test_model_single_request(tgi_service):
     assert response.details.generated_tokens == 17
     greedy_expectations = {
         "gpt2": "\n\nDeep learning is a new field of research that has been around for a while",
-        "llama": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to model",
+        "llama": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
         "mistral": "\nWhat is Deep Learning?\nDeep Learning is a type of machine learning that",
     }
     assert response.generated_text == greedy_expectations[service_name]
@@ -45,9 +45,9 @@ async def test_model_single_request(tgi_service):
         seed=42,
     )
     sample_expectations = {
-        "gpt2": "A lot of researchers have tried to make a broad, intuitive definition of Deep Learning",
-        "llama": "Deep Learning is a technique for training artificial neural networks",
-        "mistral": "Why is deep learning important?",
+        "gpt2": "A lot of researchers have tried to make",
+        "llama": "Deep Learning is a subset of Artificial Intelligence",
+        "mistral": "Deep Learning is a kind of machine learning",
     }
     assert sample_expectations[service_name] in response

@@ -79,7 +79,7 @@ async def test_model_multiple_requests(tgi_service, generate_load):
     assert len(responses) == 4
     expectations = {
         "gpt2": "\n\nDeep learning is a new field of research that has been around for a while",
-        "llama": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to model",
+        "llama": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
         "mistral": "\nWhat is Deep Learning?\nDeep Learning is a type of machine learning that",
     }
     expected = expectations[tgi_service.client.service_name]
diff --git a/text-generation-inference/tests/server/test_decode.py b/text-generation-inference/tests/server/test_decode.py
index b403eb777..6ce638947 100644
--- a/text-generation-inference/tests/server/test_decode.py
+++ b/text-generation-inference/tests/server/test_decode.py
@@ -36,14 +36,14 @@ def _test_decode(config_name, generator, do_sample):
     assert output.finish_reason == 0
     if do_sample:
         expected_text = {
-            "gpt2": " The sun was up on the horizon, and the air was chilly. I glanced over at the stars",
-            "llama": " In the corner booth of O'Malley's Pub sat two old friends, retired police officer",
-            "mistral": " The sun was scornful in the eyes of the young people who were trying to get along with",
+            "gpt2": " The sun was set just three miles south of the city. I had just watched a big fireworks display",
+            "llama": " George Orwell, 1984\nThe government is not interested in the truth. They want to control",
+            "mistral": " The sky was as pale as a white horse's skull. A pigeon flew",
         }[config_name]
     else:
         expected_text = {
             "gpt2": '\n\n"I\'m going to go to bed," I said.\n\n"I\'m going',
-            "llama": " In the small town of Meadowgrove, everyone knew each other, and they all took",
+            "llama": " George Orwell’s classic dystopian novel, 1984, begins with this ominous sentence. The story",
             "mistral": "\nThe clocks were striking thirteen.\nThe clocks were striking thirteen.",
         }[config_name]
     assert output.text == expected_text
diff --git a/text-generation-inference/tests/server/test_prefill.py b/text-generation-inference/tests/server/test_prefill.py
index 6412f926f..c567feaec 100644
--- a/text-generation-inference/tests/server/test_prefill.py
+++ b/text-generation-inference/tests/server/test_prefill.py
@@ -34,9 +34,9 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     assert next_batch.max_tokens == batch_size * max_length
     assert len(generations) == batch_size
     if do_sample:
-        expectations = {"gpt2": [383, " The"], "llama": [560, " In"], "mistral": [450, " The"]}[config_name]
+        expectations = {"gpt2": [383, " The"], "llama": [10058, " George"], "mistral": [450, " The"]}[config_name]
     else:
-        expectations = {"gpt2": [198, "\n"], "llama": [560, " In"], "mistral": [13, "\n"]}[config_name]
+        expectations = {"gpt2": [198, "\n"], "llama": [10058, " George"], "mistral": [13, "\n"]}[config_name]
     for g in generations:
         tokens = g.tokens
         assert tokens.ids[0] == expectations[0]
@@ -67,7 +67,7 @@ def test_prefill_truncate(neuron_model_config):
     # be different because of the truncation
     expectations = {
         "gpt2": [" He", " He", "\n", " He"],
-        "llama": ["\n", "\n", " He", "\n"],
+        "llama": [" —", " The", " He", " He"],
         "mistral": [" He", "\n", " He", " He"],
     }[config_name]
     for i, g in enumerate(generations):