diff --git a/benchmark/text-generation-inference/README.md b/benchmark/text-generation-inference/README.md
index 1dc712fb9..ca6bf4cee 100644
--- a/benchmark/text-generation-inference/README.md
+++ b/benchmark/text-generation-inference/README.md
@@ -2,7 +2,7 @@

 ## Local environment setup

-These configurations are tested and run on an inf2.48xlarge with the Hugging Face Deep Learning AMI from the AWS Marketplace. 
+These configurations are tested and run on an inf2.48xlarge with the Hugging Face Deep Learning AMI from the AWS Marketplace.

 Copy the configurations down using

@@ -44,16 +44,8 @@ Alternatively, you can edit the appropriate docker-compose.yaml to supply the fu

 ## Start the servers

-For smaller models, you can use the multi-server configuration with a load balancer:
-
-```shell
-$ docker compose --env-file llama-7b/.env up
-```
-
-For larger models, use their specific docker files:
-
 ```shell
-$ docker compose -f llama3-70b/docker-compose.yaml --env-file llama3-70b/.env up
+$ docker compose -f llama3-8b/docker-compose.yaml --env-file llama3-8b/.env up
 ```

 Note: edit the .env file to change the model configuration
@@ -87,7 +79,7 @@ $ ./run_all.sh NousResearch/Meta-Llama-3-70B-Instruct

 ### Compiling the model

-If you are trying to run a configuration or a model that is not available in the cache, you can compile the model before you run it, then load it locally. 
+If you are trying to run a configuration or a model that is not available in the cache, you can compile the model before you run it, then load it locally.

 See the [llama3-70b-trn1.32xlarge](llama3-70b-trn1.32xlarge) as an example.

diff --git a/benchmark/text-generation-inference/llama-7b/.env b/benchmark/text-generation-inference/llama-7b/.env
index 2b89e8305..ec8e3811d 100644
--- a/benchmark/text-generation-inference/llama-7b/.env
+++ b/benchmark/text-generation-inference/llama-7b/.env
@@ -1,5 +1,5 @@
 MODEL_ID='NousResearch/Llama-2-7b-chat-hf'
 HF_AUTO_CAST_TYPE='fp16'
-MAX_BATCH_SIZE=32
+MAX_BATCH_SIZE=24
 MAX_INPUT_LENGTH=3072
 MAX_TOTAL_TOKENS=4096
diff --git a/benchmark/text-generation-inference/llama-7b/docker-compose.yaml b/benchmark/text-generation-inference/llama-7b/docker-compose.yaml
new file mode 100644
index 000000000..6d86d5afb
--- /dev/null
+++ b/benchmark/text-generation-inference/llama-7b/docker-compose.yaml
@@ -0,0 +1,57 @@
+version: '3.7'
+
+services:
+  tgi-1:
+    image: neuronx-tgi:latest
+    ports:
+      - "8081:8081"
+    environment:
+      - PORT=8081
+      - MODEL_ID=${MODEL_ID}
+      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
+      - HF_NUM_CORES=12
+      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
+      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
+      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
+      - MAX_CONCURRENT_REQUESTS=512
+    devices:
+      - "/dev/neuron0"
+      - "/dev/neuron1"
+      - "/dev/neuron2"
+      - "/dev/neuron3"
+      - "/dev/neuron4"
+      - "/dev/neuron5"
+
+  tgi-2:
+    image: neuronx-tgi:latest
+    ports:
+      - "8082:8082"
+    environment:
+      - PORT=8082
+      - MODEL_ID=${MODEL_ID}
+      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
+      - HF_NUM_CORES=12
+      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
+      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
+      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
+      - MAX_CONCURRENT_REQUESTS=512
+    devices:
+      - "/dev/neuron6"
+      - "/dev/neuron7"
+      - "/dev/neuron8"
+      - "/dev/neuron9"
+      - "/dev/neuron10"
+      - "/dev/neuron11"
+
+  loadbalancer:
+    image: nginx:alpine
+    ports:
+      - "8080:80"
+    volumes:
+      - ./nginx.conf:/etc/nginx/nginx.conf:ro
+    depends_on:
+      - tgi-1
+      - tgi-2
+    deploy:
+      placement:
+        constraints: [node.role == manager]
diff --git a/benchmark/text-generation-inference/llama-7b/nginx.conf b/benchmark/text-generation-inference/llama-7b/nginx.conf
new file mode 100644
index 000000000..01a8f657b
--- /dev/null
+++ b/benchmark/text-generation-inference/llama-7b/nginx.conf
@@ -0,0 +1,14 @@
+### Nginx TGI Load Balancer
+events {}
+http {
+  upstream tgicluster {
+    server tgi-1:8081;
+    server tgi-2:8082;
+  }
+  server {
+    listen 80;
+    location / {
+      proxy_pass http://tgicluster;
+    }
+  }
+}
diff --git a/benchmark/text-generation-inference/llama-7b/tgi-results.csv b/benchmark/text-generation-inference/llama-7b/tgi-results.csv
index 96f382dc9..faa69bec7 100644
--- a/benchmark/text-generation-inference/llama-7b/tgi-results.csv
+++ b/benchmark/text-generation-inference/llama-7b/tgi-results.csv
@@ -1,11 +1,13 @@
 model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
-huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.811941495564616,0.3781782309997652,71.37198062194233
-huggingface/NousResearch/Llama-2-7b-chat-hf,2,23.461539426271507,0.3602376449998701,71.70553820509232
-huggingface/NousResearch/Llama-2-7b-chat-hf,4,45.45448705790145,0.3612828944997091,73.58663426819392
-huggingface/NousResearch/Llama-2-7b-chat-hf,8,71.13444471932405,0.3752646894999998,74.85884378373552
-huggingface/NousResearch/Llama-2-7b-chat-hf,16,138.54599491404485,0.6447374934998606,81.11484812939682
-huggingface/NousResearch/Llama-2-7b-chat-hf,32,247.32811870027916,1.0393478490004782,85.0958261705239
-huggingface/NousResearch/Llama-2-7b-chat-hf,64,391.3595246354876,2.2831421710016,99.36474989676213
-huggingface/NousResearch/Llama-2-7b-chat-hf,128,464.82600069905294,3.342431744500118,120.29151899306808
-huggingface/NousResearch/Llama-2-7b-chat-hf,256,526.7164477974997,6.532527566999306,160.52458146930456
-huggingface/NousResearch/Llama-2-7b-chat-hf,512,506.7975712115936,27.33909000099993,260.14547684970137
+huggingface/NousResearch/Llama-2-7b-chat-hf,1,32.32303647791415,0.4092339959997844,32.98457994546189
+huggingface/NousResearch/Llama-2-7b-chat-hf,8,280.2455817454919,0.4103973410001345,18.824823855788903
+huggingface/NousResearch/Llama-2-7b-chat-hf,16,606.2237208004269,0.42390128999977605,19.73879150452322
+huggingface/NousResearch/Llama-2-7b-chat-hf,24,778.5847225896651,0.44628154350084515,21.729555672304947
+huggingface/NousResearch/Llama-2-7b-chat-hf,32,660.0774421854719,0.6625862749997395,40.97050480951723
+huggingface/NousResearch/Llama-2-7b-chat-hf,50,809.3513111702051,1.1112228684996808,32.355166522075024
+huggingface/NousResearch/Llama-2-7b-chat-hf,64,902.2019208540152,1.518584174499665,34.52519498921747
+huggingface/NousResearch/Llama-2-7b-chat-hf,96,1000.426066970307,2.581174633500268,44.06432188527795
+huggingface/NousResearch/Llama-2-7b-chat-hf,100,965.894643860531,5.110174397500032,44.77109148855796
+huggingface/NousResearch/Llama-2-7b-chat-hf,128,1070.1775339135268,7.600166947499474,49.67094403911358
+huggingface/NousResearch/Llama-2-7b-chat-hf,150,1059.6704842739082,8.564977125500263,55.47304516981905
+huggingface/NousResearch/Llama-2-7b-chat-hf,200,1138.036018763616,13.465086967999923,64.28243222745746
diff --git a/benchmark/text-generation-inference/llama3-70b-inf2.48xlarge/tgi-results.csv b/benchmark/text-generation-inference/llama3-70b-inf2.48xlarge/tgi-results.csv
index 39bb30005..fe0313b6e 100644
--- a/benchmark/text-generation-inference/llama3-70b-inf2.48xlarge/tgi-results.csv
+++ b/benchmark/text-generation-inference/llama3-70b-inf2.48xlarge/tgi-results.csv
@@ -1,11 +1,4 @@
 model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,18.818667211424472,1.3884793975012144,51.46871325828836
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,2,32.22257477833452,2.0121661404991755,56.734265583687296
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,4,50.19917175671667,5.205651430500438,66.04042245148653
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,52.13272738944358,9.568476632499369,97.32615035298838
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,53.59997031445967,26.087651531999654,191.19227161475598
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,32,56.08684244759754,61.25285707449984,310.16900484570965
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,64,57.40338464731561,129.3146581359997,560.2474255463762
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,128,58.39025853766574,267.3882590960002,1094.9986170264501
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,256,58.589480601098536,541.6153878579971,2147.5413489446523
-huggingface/NousResearch/Meta-Llama-3-70B-Instruct,512,58.69645477077839,1085.1772966810022,4231.7554182432905
\ No newline at end of file
+huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,22.4846564103628,1.2006561384987435,43.7079989917263
+huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,64.09420641185218,4.925495064999268,85.36754380113435
+huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,66.62450133873871,22.87049979600488,135.4123021951354
diff --git a/benchmark/text-generation-inference/llama3-8b/.env b/benchmark/text-generation-inference/llama3-8b/.env
new file mode 100644
index 000000000..d0b814f71
--- /dev/null
+++ b/benchmark/text-generation-inference/llama3-8b/.env
@@ -0,0 +1,5 @@
+MODEL_ID='NousResearch/Meta-Llama-3-8B-Instruct'
+HF_AUTO_CAST_TYPE='fp16'
+MAX_BATCH_SIZE=32
+MAX_INPUT_LENGTH=4000
+MAX_TOTAL_TOKENS=4096
diff --git a/benchmark/text-generation-inference/docker-compose.yaml b/benchmark/text-generation-inference/llama3-8b/docker-compose.yaml
similarity index 100%
rename from benchmark/text-generation-inference/docker-compose.yaml
rename to benchmark/text-generation-inference/llama3-8b/docker-compose.yaml
diff --git a/benchmark/text-generation-inference/nginx.conf b/benchmark/text-generation-inference/llama3-8b/nginx.conf
similarity index 100%
rename from benchmark/text-generation-inference/nginx.conf
rename to benchmark/text-generation-inference/llama3-8b/nginx.conf
diff --git a/benchmark/text-generation-inference/llama3-8b/tgi-results.csv b/benchmark/text-generation-inference/llama3-8b/tgi-results.csv
new file mode 100644
index 000000000..c7ea60d67
--- /dev/null
+++ b/benchmark/text-generation-inference/llama3-8b/tgi-results.csv
@@ -0,0 +1,13 @@
+model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,1,44.72947298811359,0.2930618720001803,21.387192412995546
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,8,254.26638394677616,0.3072573690005811,24.51789344634094
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,16,396.49578354796415,0.31329568949968234,29.41915454622028
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,24,458.9461712504898,0.31723227349903027,36.45821381291491
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,32,540.3852559365118,0.31949053349944734,39.548380672987705
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,50,648.5983772802653,0.6981559694995667,47.64409672960739
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,64,729.1746189461367,0.8981061290014623,51.60748655120524
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,96,823.7525735951876,1.1334064394995949,60.62761554646364
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,100,829.9821677199822,1.2774171685014153,63.72698943226848
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,128,838.2579077776568,1.5125607664995186,74.21572967679927
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,150,849.2518611727152,1.7492157529995893,80.71755481115194
+huggingface/NousResearch/Meta-Llama-3-8B-Instruct,200,864.4918328198065,2.4135443449995364,98.63394209087461
diff --git a/benchmark/text-generation-inference/mistral-7b/docker-compose.yaml b/benchmark/text-generation-inference/mistral-7b/docker-compose.yaml
new file mode 100644
index 000000000..be606f265
--- /dev/null
+++ b/benchmark/text-generation-inference/mistral-7b/docker-compose.yaml
@@ -0,0 +1,73 @@
+version: '3.7'
+
+services:
+  tgi-1:
+    image: neuronx-tgi:latest
+    ports:
+      - "8081:8081"
+    environment:
+      - PORT=8081
+      - MODEL_ID=${MODEL_ID}
+      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
+      - HF_NUM_CORES=8
+      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
+      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
+      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
+      - MAX_CONCURRENT_REQUESTS=512
+    devices:
+      - "/dev/neuron0"
+      - "/dev/neuron1"
+      - "/dev/neuron2"
+      - "/dev/neuron3"
+
+  tgi-2:
+    image: neuronx-tgi:latest
+    ports:
+      - "8082:8082"
+    environment:
+      - PORT=8082
+      - MODEL_ID=${MODEL_ID}
+      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
+      - HF_NUM_CORES=8
+      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
+      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
+      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
+      - MAX_CONCURRENT_REQUESTS=512
+    devices:
+      - "/dev/neuron4"
+      - "/dev/neuron5"
+      - "/dev/neuron6"
+      - "/dev/neuron7"
+
+  tgi-3:
+    image: neuronx-tgi:latest
+    ports:
+      - "8083:8083"
+    environment:
+      - PORT=8083
+      - MODEL_ID=${MODEL_ID}
+      - HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
+      - HF_NUM_CORES=8
+      - MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
+      - MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
+      - MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
+      - MAX_CONCURRENT_REQUESTS=512
+    devices:
+      - "/dev/neuron8"
+      - "/dev/neuron9"
+      - "/dev/neuron10"
+      - "/dev/neuron11"
+
+  loadbalancer:
+    image: nginx:alpine
+    ports:
+      - "8080:80"
+    volumes:
+      - ./nginx.conf:/etc/nginx/nginx.conf:ro
+    depends_on:
+      - tgi-1
+      - tgi-2
+      - tgi-3
+    deploy:
+      placement:
+        constraints: [node.role == manager]
diff --git a/benchmark/text-generation-inference/mistral-7b/nginx.conf b/benchmark/text-generation-inference/mistral-7b/nginx.conf
new file mode 100644
index 000000000..37a3b8721
--- /dev/null
+++ b/benchmark/text-generation-inference/mistral-7b/nginx.conf
@@ -0,0 +1,15 @@
+### Nginx TGI Load Balancer
+events {}
+http {
+  upstream tgicluster {
+    server tgi-1:8081;
+    server tgi-2:8082;
+    server tgi-3:8083;
+  }
+  server {
+    listen 80;
+    location / {
+      proxy_pass http://tgicluster;
+    }
+  }
+}
diff --git a/benchmark/text-generation/llama2-7b.py b/benchmark/text-generation/llama2-7b.py
index 754475c45..e4a7541b6 100644
--- a/benchmark/text-generation/llama2-7b.py
+++ b/benchmark/text-generation/llama2-7b.py
@@ -8,7 +8,7 @@


 def main():
-    NUM_CORES = 8
+    NUM_CORES = 12
     num_cores = get_available_cores()
     if num_cores < NUM_CORES:
         raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
@@ -18,7 +18,7 @@ def main():
         "Llama-2-7B-BS4": ["meta-llama/Llama-2-7b-chat-hf", 4, 4096],
         "Llama-2-7B-BS8": ["meta-llama/Llama-2-7b-chat-hf", 8, 4096],
"Llama-2-7B-BS16": ["meta-llama/Llama-2-7b-chat-hf", 16, 4096], - "Llama-2-7B-BS32": ["meta-llama/Llama-2-7b-chat-hf", 32, 4096], + "Llama-2-7B-BS24": ["meta-llama/Llama-2-7b-chat-hf", 24, 4096], } for model_name, model_configuration in model_configurations.items(): diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/latency.png b/docs/assets/benchmarks/inferentia-llama2-7b/latency.png index 2f657938b..8b26732b6 100644 Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/latency.png and b/docs/assets/benchmarks/inferentia-llama2-7b/latency.png differ diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png b/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png index 955ee60de..a1fe59446 100644 Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png and b/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png differ diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png b/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png index 8844a75e7..ec7a219db 100644 Binary files a/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png and b/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png differ diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/latency.png b/docs/assets/benchmarks/inferentia-llama3-8b/latency.png index b0af4a39e..e00192997 100644 Binary files a/docs/assets/benchmarks/inferentia-llama3-8b/latency.png and b/docs/assets/benchmarks/inferentia-llama3-8b/latency.png differ diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png b/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png index 37deedd97..8dc1c23dc 100644 Binary files a/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png and b/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png differ diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png b/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png index 2dd4abe77..a76e48020 100644 Binary files a/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png and b/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png differ diff --git a/docs/source/benchmarks/inferentia-llama2-7b.mdx b/docs/source/benchmarks/inferentia-llama2-7b.mdx index 6503a9d80..2a9de6508 100644 --- a/docs/source/benchmarks/inferentia-llama2-7b.mdx +++ b/docs/source/benchmarks/inferentia-llama2-7b.mdx @@ -26,9 +26,9 @@ For this benchmark we will use the following configurations: | Llama2 7B BS4 | 4 | 4096 | | Llama2 7B BS8 | 8 | 4096 | | Llama2 7B BS16 | 16 | 4096 | -| Llama2 7B BS32 | 32 | 4096 | +| Llama2 7B BS32 | 24 | 4096 | -*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.* +*Note: all models are compiled to use 6 devices corresponding to 12 cores on the `inf2.48xlarge` instance.* *Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.* diff --git a/optimum/exporters/neuron/model_configs.py b/optimum/exporters/neuron/model_configs.py index 3ff82e886..689de5331 100644 --- a/optimum/exporters/neuron/model_configs.py +++ b/optimum/exporters/neuron/model_configs.py @@ -869,4 +869,4 @@ class MistralNeuronConfig(TextNeuronDecoderConfig): @register_in_tasks_manager("mixtral", "text-generation") class MixtralNeuronConfig(TextNeuronDecoderConfig): NEURONX_CLASS = "mixtral.model.MixtralForSampling" - CONTINUOUS_BATCHING = True + CONTINUOUS_BATCHING = False diff --git a/optimum/neuron/distributed/checkpointing.py b/optimum/neuron/distributed/checkpointing.py index 
--- a/optimum/neuron/distributed/checkpointing.py
+++ b/optimum/neuron/distributed/checkpointing.py
@@ -65,7 +65,7 @@ def convert_fn(tensors):
         return rewritten_tensors

     def select_fn(v):
-        return type(v) == xser.TensorReference
+        return type(v) is xser.TensorReference

     return xm.ToXlaTensorArena(convert_fn, select_fn).transform(ref_data)

diff --git a/optimum/neuron/generation/utils.py b/optimum/neuron/generation/utils.py
index da83a1db0..4d9c4f802 100644
--- a/optimum/neuron/generation/utils.py
+++ b/optimum/neuron/generation/utils.py
@@ -1628,7 +1628,7 @@ def beam_search(
             )

             for k, v in sequence_outputs.items():
-                if type(v) == torch.Tensor:
+                if type(v) is torch.Tensor:
                     sequence_outputs[k] = sequence_outputs[k].to(input_ids.device)

             if return_dict_in_generate:
diff --git a/optimum/neuron/version.py b/optimum/neuron/version.py
index 7c7c0f5a9..9d4ec89f1 100644
--- a/optimum/neuron/version.py
+++ b/optimum/neuron/version.py
@@ -14,4 +14,4 @@

 __version__ = "0.0.24.dev0"

-__sdk_version__ = "2.18.0"
+__sdk_version__ = "2.19.1"
diff --git a/setup.py b/setup.py
index e29fdf22d..63febc0fd 100644
--- a/setup.py
+++ b/setup.py
@@ -61,12 +61,12 @@
     ],
     "neuronx": [
        "wheel",
-        "neuronx-cc==2.13.66.0",
-        "torch-neuronx==2.1.2.2.1.0",
-        "transformers-neuronx==0.10.0.21",
+        "neuronx-cc==2.14.227.0",
+        "torch-neuronx==2.1.2.2.2.0",
+        "transformers-neuronx==0.11.351",
        "torch==2.1.2.*",
        "torchvision==0.16.*",
-        "neuronx_distributed==0.7.0",
+        "neuronx_distributed==0.8.0",
     ],
     "diffusers": ["diffusers>=0.28.0, <0.29.0", "peft"],
     "sentence-transformers": ["sentence-transformers >= 2.2.0"],
@@ -77,7 +77,7 @@
     version=__version__,
     description=(
         "Optimum Neuron is the interface between the Hugging Face Transformers and Diffusers libraries and AWS "
-        "Tranium and Inferentia accelerators. It provides a set of tools enabling easy model loading, training and "
+        "Trainium and Inferentia accelerators. It provides a set of tools enabling easy model loading, training and "
         "inference on single and multiple neuron core settings for different downstream tasks."
     ),
     long_description=open("README.md", "r", encoding="utf-8").read(),
diff --git a/tests/decoder/conftest.py b/tests/decoder/conftest.py
index 1b02a20de..c50c3c72c 100644
--- a/tests/decoder/conftest.py
+++ b/tests/decoder/conftest.py
@@ -30,13 +30,17 @@
         "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
     },
     "llama": {
-        "model_id": "princeton-nlp/Sheared-LLaMA-1.3B",
+        "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
         "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
     },
     "mistral": {
         "model_id": "optimum/mistral-1.1b-testing",
         "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
     },
+    "mixtral": {
+        "model_id": "dacorvo/Mixtral-tiny",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
 }


diff --git a/tests/pipelines/test_decoder_pipelines.py b/tests/decoder/test_decoder_pipelines.py
similarity index 72%
rename from tests/pipelines/test_decoder_pipelines.py
rename to tests/decoder/test_decoder_pipelines.py
index d9a2025b4..9f850a871 100644
--- a/tests/pipelines/test_decoder_pipelines.py
+++ b/tests/decoder/test_decoder_pipelines.py
@@ -43,45 +43,44 @@ def _test_generation(p):

 @is_inferentia_test
 @requires_neuronx
-def test_export_no_parameters(inf_decoder_model):
-    p = pipeline("text-generation", inf_decoder_model, export=True)
+def test_export_no_parameters():
+    p = pipeline("text-generation", "gpt2", export=True)
     _test_generation(p)


 @is_inferentia_test
 @requires_neuronx
-def test_load_no_parameters(inf_decoder_path):
-    p = pipeline("text-generation", inf_decoder_path)
+def test_load_no_parameters(neuron_decoder_path):
+    p = pipeline("text-generation", neuron_decoder_path)
     _test_generation(p)


 @is_inferentia_test
 @requires_neuronx
-def test_from_model_and_tokenizer(inf_decoder_path):
-    m = NeuronModelForCausalLM.from_pretrained(inf_decoder_path)
-    t = AutoTokenizer.from_pretrained(inf_decoder_path)
+def test_from_model_and_tokenizer(neuron_decoder_path):
+    m = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
+    t = AutoTokenizer.from_pretrained(neuron_decoder_path)
     p = pipeline("text-generation", model=m, tokenizer=t)
     _test_generation(p)


 @is_inferentia_test
 @requires_neuronx
-def test_error_already_exported(inf_decoder_path):
+def test_error_already_exported(neuron_decoder_path):
     with pytest.raises(ValueError, match="already been exported"):
-        pipeline("text-generation", inf_decoder_path, export=True)
+        pipeline("text-generation", neuron_decoder_path, export=True)


 @is_inferentia_test
 @requires_neuronx
-def test_error_needs_export(inf_decoder_model):
+def test_error_needs_export():
     with pytest.raises(ValueError, match="must be exported"):
-        pipeline("text-generation", inf_decoder_model, export=False)
+        pipeline("text-generation", "gpt2", export=False)


 @is_inferentia_test
 @requires_neuronx
-def test_from_hub():
-    model_id = "dacorvo/tiny-random-gpt2-neuronx"
-    revision = "1b3456cf877cc42c053ee8464f1067021eccde4b"
-    p = pipeline("text-generation", model_id, revision=revision)
+def test_from_hub(neuron_decoder_config):
+    model_id = neuron_decoder_config["neuron_model_id"]
+    p = pipeline("text-generation", model_id)
     _test_generation(p)
diff --git a/tests/pipelines/conftest.py b/tests/pipelines/conftest.py
index cc8aa2dc7..33be04610 100644
--- a/tests/pipelines/conftest.py
+++ b/tests/pipelines/conftest.py
@@ -12,13 +12,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from tempfile import TemporaryDirectory

 import pytest
-from transformers import AutoTokenizer
-
-from optimum.neuron import NeuronModelForCausalLM
-from optimum.neuron.utils.testing_utils import requires_neuronx


 STD_TEXT_TASKS = [
@@ -33,22 +28,3 @@
 @pytest.fixture(scope="module", params=STD_TEXT_TASKS)
 def std_text_task(request):
     return request.param
-
-
-@pytest.fixture(scope="module")
-@requires_neuronx
-def inf_decoder_path(inf_decoder_model):
-    model = NeuronModelForCausalLM.from_pretrained(
-        inf_decoder_model, export=True, batch_size=1, sequence_length=128, num_cores=2
-    )
-    model_dir = TemporaryDirectory()
-    model_path = model_dir.name
-    model.save_pretrained(model_path)
-    del model
-    tokenizer = AutoTokenizer.from_pretrained(inf_decoder_model)
-    tokenizer.save_pretrained(model_path)
-    del tokenizer
-    # Yield instead of returning to keep a reference to the temporary directory.
-    # It will go out of scope and be released only once all tests needing the fixture
-    # have been completed.
-    yield model_path
diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile
index 2941fdd6e..597a3bea7 100644
--- a/text-generation-inference/Dockerfile
+++ b/text-generation-inference/Dockerfile
@@ -94,19 +94,19 @@ RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEU
 # Install neuronx packages
 RUN apt-get update -y \
  && apt-get install -y --no-install-recommends \
-    aws-neuronx-dkms=2.16.7.0 \
-    aws-neuronx-collectives=2.20.22.0-c101c322e \
-    aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 \
-    aws-neuronx-tools=2.17.1.0 \
+    aws-neuronx-dkms=2.17.17.0 \
+    aws-neuronx-collectives=2.21.46.0-69b77134b \
+    aws-neuronx-runtime-lib=2.21.41.0-fb1705f5f \
+    aws-neuronx-tools=2.18.3.0 \
  && rm -rf /var/lib/apt/lists/* \
  && apt-get clean

 ENV PATH="/opt/bin/:/opt/aws/neuron/bin:${PATH}"

 RUN pip3 install \
-    neuronx-cc==2.13.66.0 \
-    torch-neuronx==2.1.2.2.1.0 \
-    transformers-neuronx==0.10.0.21 \
+    neuronx-cc==2.14.227.0 \
+    torch-neuronx==2.1.2.2.2.0 \
+    transformers-neuronx==0.11.351 \
     --extra-index-url=https://pip.repos.neuron.amazonaws.com

 # Install HuggingFace packages
diff --git a/text-generation-inference/tests/fixtures/model.py b/text-generation-inference/tests/fixtures/model.py
index 5ee46b598..c94d45784 100644
--- a/text-generation-inference/tests/fixtures/model.py
+++ b/text-generation-inference/tests/fixtures/model.py
@@ -30,7 +30,7 @@
         "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
     },
     "llama": {
-        "model_id": "HuggingFaceTB/cosmo-1b",
+        "model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
         "export_kwargs": {"batch_size": 4, "sequence_length": 2048, "num_cores": 2, "auto_cast_type": "fp16"},
     },
     "mistral": {
diff --git a/text-generation-inference/tests/integration/test_generate.py b/text-generation-inference/tests/integration/test_generate.py
index 8ee2eaf2d..79beca5c7 100644
--- a/text-generation-inference/tests/integration/test_generate.py
+++ b/text-generation-inference/tests/integration/test_generate.py
@@ -22,7 +22,7 @@ async def test_model_single_request(tgi_service):
     assert response.details.generated_tokens == 17
     greedy_expectations = {
         "gpt2": "\n\nDeep learning is a new field of research that has been around for a while",
-        "llama": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to model",
+        "llama": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
         "mistral": "\nWhat is Deep Learning?\nDeep Learning is a type of machine learning that",
     }
     assert response.generated_text == greedy_expectations[service_name]
@@ -45,9 +45,9 @@ async def test_model_single_request(tgi_service):
         seed=42,
     )
     sample_expectations = {
-        "gpt2": "A lot of researchers have tried to make a broad, intuitive definition of Deep Learning",
-        "llama": "Deep Learning is a technique for training artificial neural networks",
-        "mistral": "Why is deep learning important?",
+        "gpt2": "A lot of researchers have tried to make",
+        "llama": "Deep Learning is a subset of Artificial Intelligence",
+        "mistral": "Deep Learning is a kind of machine learning",
     }
     assert sample_expectations[service_name] in response

@@ -79,7 +79,7 @@ async def test_model_multiple_requests(tgi_service, generate_load):
     assert len(responses) == 4
     expectations = {
         "gpt2": "\n\nDeep learning is a new field of research that has been around for a while",
-        "llama": "\n\nDeep learning is a subset of machine learning that uses artificial neural networks to model",
+        "llama": " A Beginner’s Guide\nDeep learning is a subset of machine learning that involves the use",
         "mistral": "\nWhat is Deep Learning?\nDeep Learning is a type of machine learning that",
     }
     expected = expectations[tgi_service.client.service_name]
diff --git a/text-generation-inference/tests/server/test_decode.py b/text-generation-inference/tests/server/test_decode.py
index b403eb777..6ce638947 100644
--- a/text-generation-inference/tests/server/test_decode.py
+++ b/text-generation-inference/tests/server/test_decode.py
@@ -36,14 +36,14 @@ def _test_decode(config_name, generator, do_sample):
     assert output.finish_reason == 0
     if do_sample:
         expected_text = {
-            "gpt2": " The sun was up on the horizon, and the air was chilly. I glanced over at the stars",
-            "llama": " In the corner booth of O'Malley's Pub sat two old friends, retired police officer",
-            "mistral": " The sun was scornful in the eyes of the young people who were trying to get along with",
+            "gpt2": " The sun was set just three miles south of the city. I had just watched a big fireworks display",
+            "llama": " George Orwell, 1984\nThe government is not interested in the truth. They want to control",
+            "mistral": " The sky was as pale as a white horse's skull. A pigeon flew",
         }[config_name]
     else:
         expected_text = {
             "gpt2": '\n\n"I\'m going to go to bed," I said.\n\n"I\'m going',
-            "llama": " In the small town of Meadowgrove, everyone knew each other, and they all took",
+            "llama": " George Orwell’s classic dystopian novel, 1984, begins with this ominous sentence. The story",
             "mistral": "\nThe clocks were striking thirteen.\nThe clocks were striking thirteen.",
         }[config_name]
     assert output.text == expected_text
diff --git a/text-generation-inference/tests/server/test_prefill.py b/text-generation-inference/tests/server/test_prefill.py
index 6412f926f..c567feaec 100644
--- a/text-generation-inference/tests/server/test_prefill.py
+++ b/text-generation-inference/tests/server/test_prefill.py
@@ -34,9 +34,9 @@ def _test_prefill(config_name, generator, batch_size, do_sample):
     assert next_batch.max_tokens == batch_size * max_length
     assert len(generations) == batch_size
     if do_sample:
-        expectations = {"gpt2": [383, " The"], "llama": [560, " In"], "mistral": [450, " The"]}[config_name]
+        expectations = {"gpt2": [383, " The"], "llama": [10058, " George"], "mistral": [450, " The"]}[config_name]
     else:
-        expectations = {"gpt2": [198, "\n"], "llama": [560, " In"], "mistral": [13, "\n"]}[config_name]
+        expectations = {"gpt2": [198, "\n"], "llama": [10058, " George"], "mistral": [13, "\n"]}[config_name]
     for g in generations:
         tokens = g.tokens
         assert tokens.ids[0] == expectations[0]
@@ -67,7 +67,7 @@ def test_prefill_truncate(neuron_model_config):
     # be different because of the truncation
     expectations = {
         "gpt2": [" He", " He", "\n", " He"],
-        "llama": ["\n", "\n", " He", "\n"],
+        "llama": [" —", " The", " He", " He"],
         "mistral": [" He", "\n", " He", " He"],
     }[config_name]
     for i, g in enumerate(generations):