From eb122f22fbd62758c3e09a86d1dc750ccb43b599 Mon Sep 17 00:00:00 2001 From: lijunliang Date: Thu, 25 Jul 2024 08:35:57 +0000 Subject: [PATCH 1/7] sdxl data --- .../examples/sdxl/README.md | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/onediff_diffusers_extensions/examples/sdxl/README.md b/onediff_diffusers_extensions/examples/sdxl/README.md index 7c9cf1560..881b69475 100644 --- a/onediff_diffusers_extensions/examples/sdxl/README.md +++ b/onediff_diffusers_extensions/examples/sdxl/README.md @@ -66,23 +66,24 @@ python3 benchmarks/text_to_image.py \ ## Performance comparison Testing on NVIDIA GeForce RTX 3090 / 4090, with image size of 1024*1024, iterating 20 steps: -| Metric | RTX 3090 1024*1024 | RTX 4090 1024*1024 | -| ------------------------------------ | --------------------- | --------------------- | -| Data update date (yyyy-mm-dd) | 2024-07-10 | 2024-07-10 | -| PyTorch iteration speed | 4.08 it/s | 6.93 it/s | -| OneDiff iteration speed | 7.21 it/s (+76.7%) | 13.92 it/s (+100.9%) | -| PyTorch E2E time | 5.60 s | 3.23 s | -| OneDiff E2E time | 3.41 s (-39.1%) | 1.67 s (-48.3%) | -| PyTorch Max Mem Used | 10.467 GiB | 10.467 GiB | -| OneDiff Max Mem Used | 12.004 GiB | 12.021 GiB | -| PyTorch Warmup with Run time | | | -| OneDiff Warmup with Compilation time | 474.36 s 1 | 236.54 s 2 | -| OneDiff Warmup with Cache time | 306.84 s | 104.57 s | +| Metric | RTX 3090 1024*1024 | RTX 4090 1024*1024 |RTX 4090(32G) 1024*1024|RTX 4090(48G) 1024*1024| +| ------------------------------------ | --------------------- | --------------------- | --------------------- | --------------------- | +| Data update date (yyyy-mm-dd) | 2024-07-10 | 2024-07-10 |2024-07-25 |2024-07-25 | +| PyTorch iteration speed | 4.08 it/s | 6.93 it/s |6.158 it/s |7.585 it/s | +| OneDiff iteration speed | 7.21 it/s (+76.7%) | 13.92 it/s (+100.9%) |11.789 it/s (+91.4%) |14.895 it/s (96.3%) | +| PyTorch E2E time | 5.60 s | 3.23 s |3.674s |2.972 s | +| OneDiff 
E2E time | 3.41 s (-39.1%) | 1.67 s (-48.3%) |2.029s (-44.8%) |1.571s (-47.2%) | +| PyTorch Max Mem Used | 10.467 GiB | 10.467 GiB |10.465 GiB |10.471 GiB | +| OneDiff Max Mem Used | 12.004 GiB | 12.021 GiB |12.002 GiB |12.013 GiB | +| PyTorch Warmup with Run time | | | | | +| OneDiff Warmup with Compilation time | 474.36 s 1 | 236.54 s 2 |142.691 s 3 |287.011 s 3 | +| OneDiff Warmup with Cache time | 306.84 s | 104.57 s |142.992s |132.207 s | 1 OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz. Note this is just for reference, and it varies a lot on different CPU. 2 AMD EPYC 7543 32-Core Processor. +3 Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz (8 core). ## Dynamic shape for SDXL From 29dfb2a94644541136b19f3f61be020011264130 Mon Sep 17 00:00:00 2001 From: lijunliang Date: Thu, 25 Jul 2024 08:36:34 +0000 Subject: [PATCH 2/7] remove extra chars --- onediff_diffusers_extensions/examples/sdxl/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onediff_diffusers_extensions/examples/sdxl/README.md b/onediff_diffusers_extensions/examples/sdxl/README.md index 881b69475..5cef0cf6a 100644 --- a/onediff_diffusers_extensions/examples/sdxl/README.md +++ b/onediff_diffusers_extensions/examples/sdxl/README.md @@ -66,9 +66,9 @@ python3 benchmarks/text_to_image.py \ ## Performance comparison Testing on NVIDIA GeForce RTX 3090 / 4090, with image size of 1024*1024, iterating 20 steps: -| Metric | RTX 3090 1024*1024 | RTX 4090 1024*1024 |RTX 4090(32G) 1024*1024|RTX 4090(48G) 1024*1024| +| Metric | RTX 3090 1024*1024 | RTX 4090 1024*1024 |RTX 4090(32G) 1024*1024|RTX 4090(48G) 1024*1024| | ------------------------------------ | --------------------- | --------------------- | --------------------- | --------------------- | -| Data update date (yyyy-mm-dd) | 2024-07-10 | 2024-07-10 |2024-07-25 |2024-07-25 | +| Data update date (yyyy-mm-dd) | 2024-07-10 | 2024-07-10 |2024-07-25 |2024-07-25 | | PyTorch iteration speed | 
4.08 it/s | 6.93 it/s |6.158 it/s |7.585 it/s | | OneDiff iteration speed | 7.21 it/s (+76.7%) | 13.92 it/s (+100.9%) |11.789 it/s (+91.4%) |14.895 it/s (96.3%) | | PyTorch E2E time | 5.60 s | 3.23 s |3.674s |2.972 s | From 73cffe48f2642924c3e56252a60258c7ebc47994 Mon Sep 17 00:00:00 2001 From: Li Junliang <117806079+lijunliangTG@users.noreply.github.com> Date: Thu, 25 Jul 2024 16:40:38 +0800 Subject: [PATCH 3/7] Fix typo in CPU comment: change 'core' to 'cores' --- onediff_diffusers_extensions/examples/sdxl/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onediff_diffusers_extensions/examples/sdxl/README.md b/onediff_diffusers_extensions/examples/sdxl/README.md index 5cef0cf6a..b0b2b8b76 100644 --- a/onediff_diffusers_extensions/examples/sdxl/README.md +++ b/onediff_diffusers_extensions/examples/sdxl/README.md @@ -83,7 +83,7 @@ Testing on NVIDIA GeForce RTX 3090 / 4090, with image size of 1024*1024, iterati 2 AMD EPYC 7543 32-Core Processor. -3 Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz (8 core). +3 Intel(R) Xeon(R) Gold 6150 CPU @ 2.70GHz (8 cores). 
## Dynamic shape for SDXL From d9500e72f802be3eec5291da27476477a3ca6cb2 Mon Sep 17 00:00:00 2001 From: lijunliang Date: Thu, 25 Jul 2024 10:17:09 +0000 Subject: [PATCH 4/7] add 4090(48G) --- .../examples/sdxl/README.md | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/onediff_diffusers_extensions/examples/sdxl/README.md b/onediff_diffusers_extensions/examples/sdxl/README.md index 5cef0cf6a..fbdd89661 100644 --- a/onediff_diffusers_extensions/examples/sdxl/README.md +++ b/onediff_diffusers_extensions/examples/sdxl/README.md @@ -66,18 +66,18 @@ python3 benchmarks/text_to_image.py \ ## Performance comparison Testing on NVIDIA GeForce RTX 3090 / 4090, with image size of 1024*1024, iterating 20 steps: -| Metric | RTX 3090 1024*1024 | RTX 4090 1024*1024 |RTX 4090(32G) 1024*1024|RTX 4090(48G) 1024*1024| -| ------------------------------------ | --------------------- | --------------------- | --------------------- | --------------------- | -| Data update date (yyyy-mm-dd) | 2024-07-10 | 2024-07-10 |2024-07-25 |2024-07-25 | -| PyTorch iteration speed | 4.08 it/s | 6.93 it/s |6.158 it/s |7.585 it/s | -| OneDiff iteration speed | 7.21 it/s (+76.7%) | 13.92 it/s (+100.9%) |11.789 it/s (+91.4%) |14.895 it/s (96.3%) | -| PyTorch E2E time | 5.60 s | 3.23 s |3.674s |2.972 s | -| OneDiff E2E time | 3.41 s (-39.1%) | 1.67 s (-48.3%) |2.029s (-44.8%) |1.571s (-47.2%) | -| PyTorch Max Mem Used | 10.467 GiB | 10.467 GiB |10.465 GiB |10.471 GiB | -| OneDiff Max Mem Used | 12.004 GiB | 12.021 GiB |12.002 GiB |12.013 GiB | -| PyTorch Warmup with Run time | | | | | -| OneDiff Warmup with Compilation time | 474.36 s 1 | 236.54 s 2 |142.691 s 3 |287.011 s 3 | -| OneDiff Warmup with Cache time | 306.84 s | 104.57 s |142.992s |132.207 s | +| Metric | RTX 3090 1024*1024 | RTX 4090 1024*1024 |RTX 4090(32G) 1024*1024|RTX 4090(48G) 1024*1024|RTX 4090(48G) 2048*2048| +| ------------------------------------ | --------------------- | --------------------- | 
--------------------- | --------------------- |---------------------- | +| Data update date (yyyy-mm-dd) | 2024-07-10 | 2024-07-10 |2024-07-25 |2024-07-25 |2024-07-25 | +| PyTorch iteration speed | 4.08 it/s | 6.93 it/s |6.158 it/s |7.585 it/s |1.649 it/s | +| OneDiff iteration speed | 7.21 it/s (+76.7%) | 13.92 it/s (+100.9%) |11.789 it/s (+91.4%) |14.895 it/s (+96.3%) |2.967 it/s (+79.9%) | +| PyTorch E2E time | 5.60 s | 3.23 s |3.674s |2.972 s |13.422s | +| OneDiff E2E time | 3.41 s (-39.1%) | 1.67 s (-48.3%) |2.029s (-44.8%) |1.571s (-47.2%) |7.688s(-42.8%) | +| PyTorch Max Mem Used | 10.467 GiB | 10.467 GiB |10.465 GiB |10.471 GiB |21.723 GiB | +| OneDiff Max Mem Used | 12.004 GiB | 12.021 GiB |12.002 GiB |12.013 GiB |24.015 GiB | +| PyTorch Warmup with Run time | | | | | | +| OneDiff Warmup with Compilation time | 474.36 s 1 | 236.54 s 2 |142.691 s 3 |287.011 s 3 |502.223 s 3 | +| OneDiff Warmup with Cache time | 306.84 s | 104.57 s |142.992s |132.207 s |363.051 s | 1 OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Silver 4314 CPU @ 2.40GHz. Note this is just for reference, and it varies a lot on different CPU. 
From d127c51d025c779e027c98cb6866dbdd3e5c5fa9 Mon Sep 17 00:00:00 2001 From: lijunliang Date: Thu, 25 Jul 2024 10:18:43 +0000 Subject: [PATCH 5/7] remove extra blank --- onediff_diffusers_extensions/examples/sdxl/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onediff_diffusers_extensions/examples/sdxl/README.md b/onediff_diffusers_extensions/examples/sdxl/README.md index e47cac33a..b96d0caad 100644 --- a/onediff_diffusers_extensions/examples/sdxl/README.md +++ b/onediff_diffusers_extensions/examples/sdxl/README.md @@ -66,10 +66,10 @@ python3 benchmarks/text_to_image.py \ ## Performance comparison Testing on NVIDIA GeForce RTX 3090 / 4090, with image size of 1024*1024, iterating 20 steps: -| Metric | RTX 3090 1024*1024 | RTX 4090 1024*1024 |RTX 4090(32G) 1024*1024|RTX 4090(48G) 1024*1024|RTX 4090(48G) 2048*2048| +| Metric | RTX 3090 1024*1024 | RTX 4090 1024*1024 |RTX 4090(32G) 1024*1024|RTX 4090(48G) 1024*1024|RTX 4090(48G) 2048*2048| | ------------------------------------ | --------------------- | --------------------- | --------------------- | --------------------- |---------------------- | -| Data update date (yyyy-mm-dd) | 2024-07-10 | 2024-07-10 |2024-07-25 |2024-07-25 |2024-07-25 | -| PyTorch iteration speed | 4.08 it/s | 6.93 it/s |6.158 it/s |7.585 it/s |1.649 it/s | +| Data update date (yyyy-mm-dd) | 2024-07-10 | 2024-07-10 |2024-07-25 |2024-07-25 |2024-07-25 | +| PyTorch iteration speed | 4.08 it/s | 6.93 it/s |6.158 it/s |7.585 it/s |1.649 it/s | | OneDiff iteration speed | 7.21 it/s (+76.7%) | 13.92 it/s (+100.9%) |11.789 it/s (+91.4%) |14.895 it/s (+96.3%) |2.967 it/s (+79.9%) | | PyTorch E2E time | 5.60 s | 3.23 s |3.674s |2.972 s |13.422s | | OneDiff E2E time | 3.41 s (-39.1%) | 1.67 s (-48.3%) |2.029s (-44.8%) |1.571s (-47.2%) |7.688s(-42.8%) | From c9b1751fd6a7d8a1c0eb172110fdf64326c6e1bd Mon Sep 17 00:00:00 2001 From: lijunliang Date: Fri, 26 Jul 2024 01:20:27 +0000 Subject: [PATCH 6/7] add reserved CUDA memory 
--- onediff_diffusers_extensions/examples/sdxl/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onediff_diffusers_extensions/examples/sdxl/README.md b/onediff_diffusers_extensions/examples/sdxl/README.md index b96d0caad..3ef00a4cb 100644 --- a/onediff_diffusers_extensions/examples/sdxl/README.md +++ b/onediff_diffusers_extensions/examples/sdxl/README.md @@ -75,6 +75,8 @@ Testing on NVIDIA GeForce RTX 3090 / 4090, with image size of 1024*1024, iterati | OneDiff E2E time | 3.41 s (-39.1%) | 1.67 s (-48.3%) |2.029s (-44.8%) |1.571s (-47.2%) |7.688s(-42.8%) | | PyTorch Max Mem Used | 10.467 GiB | 10.467 GiB |10.465 GiB |10.471 GiB |21.723 GiB | | OneDiff Max Mem Used | 12.004 GiB | 12.021 GiB |12.002 GiB |12.013 GiB |24.015 GiB | +| PyTorch Max reserved CUDA memory Used| | | | |35.615 GiB | +| OneDiff Max reserved CUDA memory Used| | | | |35.666 GiB | | PyTorch Warmup with Run time | | | | | | | OneDiff Warmup with Compilation time | 474.36 s 1 | 236.54 s 2 |142.691 s 3 |287.011 s 3 |502.223 s 3 | | OneDiff Warmup with Cache time | 306.84 s | 104.57 s |142.992s |132.207 s |363.051 s | From 85c67c1798cbc41ffb41ead2f0b7e19c7a71b95e Mon Sep 17 00:00:00 2001 From: lijunliang Date: Fri, 26 Jul 2024 05:21:18 +0000 Subject: [PATCH 7/7] add Max reserved CUDA memory Used of 4090(32G) 4090(48G) --- onediff_diffusers_extensions/examples/sdxl/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onediff_diffusers_extensions/examples/sdxl/README.md b/onediff_diffusers_extensions/examples/sdxl/README.md index 3ef00a4cb..43b260d0c 100644 --- a/onediff_diffusers_extensions/examples/sdxl/README.md +++ b/onediff_diffusers_extensions/examples/sdxl/README.md @@ -75,8 +75,8 @@ Testing on NVIDIA GeForce RTX 3090 / 4090, with image size of 1024*1024, iterati | OneDiff E2E time | 3.41 s (-39.1%) | 1.67 s (-48.3%) |2.029s (-44.8%) |1.571s (-47.2%) |7.688s(-42.8%) | | PyTorch Max Mem Used | 10.467 GiB | 10.467 GiB |10.465 GiB |10.471 GiB |21.723 GiB | | 
OneDiff Max Mem Used | 12.004 GiB | 12.021 GiB |12.002 GiB |12.013 GiB |24.015 GiB | -| PyTorch Max reserved CUDA memory Used| | | | |35.615 GiB | -| OneDiff Max reserved CUDA memory Used| | | | |35.666 GiB | +| PyTorch Max reserved CUDA memory Used| | |14.078 GiB |14.078 GiB |35.615 GiB | +| OneDiff Max reserved CUDA memory Used| | |14.873 GiB |14.859 GiB |35.666 GiB | | PyTorch Warmup with Run time | | | | | | | OneDiff Warmup with Compilation time | 474.36 s 1 | 236.54 s 2 |142.691 s 3 |287.011 s 3 |502.223 s 3 | | OneDiff Warmup with Cache time | 306.84 s | 104.57 s |142.992s |132.207 s |363.051 s |