From 5ce6e2a497a0ec050225707c786cdeca5181afee Mon Sep 17 00:00:00 2001
From: strint
Date: Fri, 7 Jun 2024 22:08:18 +0800
Subject: [PATCH] update pix art alpha

---
 README.md                                        |  7 ++++---
 benchmarks/text_to_image.py                      |  2 +-
 .../examples/pixart_alpha/README.md              | 17 +++++++++++------
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 4069bad52..5e3011497 100644
--- a/README.md
+++ b/README.md
@@ -77,7 +77,7 @@ The Full Introduction of OneDiff:
 - [Installation](#installation)
   - [OneDiff Installation](#onediff-installation)
     - [Install a compiler backend](#install-a-compiler-backend)
-      - [(Optional) Install NexFort](#optional-install-nexfort)
+      - [(Optional) Install Nexfort](#optional-install-nexfort)
       - [(Optional) Install OneFlow](#optional-install-oneflow)
     - [2. Install torch and diffusers](#2-install-torch-and-diffusers)
     - [3. Install OneDiff](#3-install-onediff)
@@ -188,10 +188,11 @@ When considering the choice between OneFlow and Nexfort, either one is optional
 - For all other cases, it is recommended to use OneFlow. Note that optimizations within OneFlow will gradually transition to Nexfort in the future.
 
-##### (Optional) Install NexFort
+##### (Optional) Install Nexfort
+A detailed introduction to Nexfort is available [here](https://github.com/siliconflow/onediff/tree/main/onediff/src/onediff/infer_compiler/backends/nexfort/README.md).
 
 ```bash
-python3 -m pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+python3 -m pip install -U torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 torchao==0.1
 python3 -m pip install -U nexfort
 ```
 
 
diff --git a/benchmarks/text_to_image.py b/benchmarks/text_to_image.py
index cb006fc85..a0f565770 100644
--- a/benchmarks/text_to_image.py
+++ b/benchmarks/text_to_image.py
@@ -267,7 +267,7 @@ def main():
             options = json.loads(args.compiler_config)
         else:
             # config with string
-            options = '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision:cudagraphs", "memory_format": "channels_last"}'
+            options = '{"mode": "max-optimize:max-autotune:freezing", "memory_format": "channels_last"}'
         pipe = compile_pipe(
             pipe, backend="nexfort", options=options, fuse_qkv_projections=True
         )
diff --git a/onediff_diffusers_extensions/examples/pixart_alpha/README.md b/onediff_diffusers_extensions/examples/pixart_alpha/README.md
index 38ae417d2..eb0f55669 100644
--- a/onediff_diffusers_extensions/examples/pixart_alpha/README.md
+++ b/onediff_diffusers_extensions/examples/pixart_alpha/README.md
@@ -58,14 +58,10 @@ python3 ./benchmarks/text_to_image.py \
 ```
 
 ## Performance comparison
-### nexfort compile config
-- compiler-config default is `{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision:cudagraphs", "memory_format": "channels_last"}` in `/benchmarks/text_to_image.py`
-  - setting `--compiler-config '{"mode": "max-autotune", "memory_format": "channels_last"}'` will reduce compilation time and just slightly reduce the performance
-  - setting `--compiler-config '{"mode": "jit:disable-runtime-fusion", "memory_format": "channels_last"}'` will reduce compilation time to 21.832s, but will reduce the performance
-- fuse_qkv_projections: True
 
 
 ### Metric
+#### On A100
 | Metric                                           | NVIDIA A100-PCIE-40GB (1024 * 1024) |
 | ------------------------------------------------ | ----------------------------------- |
 | Data update date(yyyy-mm-dd)                     | 2024-05-23                          |
@@ -76,11 +72,12 @@
 | PyTorch Max Mem Used                             | 14.445GiB                           |
 | OneDiff Max Mem Used                             | 13.855GiB                           |
 | PyTorch Warmup with Run time                     | 4.100s                              |
-| OneDiff Warmup with Compilation time<sup>1</sup> | 776.170s                            |
+| OneDiff Warmup with Compilation time<sup>1</sup> | 510.170s                            |
 | OneDiff Warmup with Cache time                   | 111.563s                            |
 
 <sup>1</sup> OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz. Note this is just for reference; it varies a lot across different CPUs.
 
+#### On H800
 | Metric                                           | NVIDIA H800 (1024 * 1024)           |
 | ------------------------------------------------ | ----------------------------------- |
 | Data update date(yyyy-mm-dd)                     | 2024-05-29                          |
@@ -96,6 +93,14 @@
 
 <sup>2</sup> Intel(R) Xeon(R) Platinum 8468.
 
+#### Nexfort compile config and warmup cost
+- compiler-config
+  - the default in `/benchmarks/text_to_image.py` is `{"mode": "max-optimize:max-autotune:freezing", "memory_format": "channels_last"}`; compilation takes about 500 seconds
+  - setting `--compiler-config '{"mode": "max-autotune", "memory_format": "channels_last"}'` reduces compilation time to about 60 seconds and only slightly reduces performance
+  - setting `--compiler-config '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision:cudagraphs", "memory_format": "channels_last"}'` gives the best performance, but compilation takes about 700 seconds
+  - setting `--compiler-config '{"mode": "jit:disable-runtime-fusion", "memory_format": "channels_last"}'` reduces compilation time to about 20 seconds, but reduces performance
+- fuse_qkv_projections: True
+
 ## Quantization
 
 Onediff's nexfort backend works closely with Torchao to support model quantization. Quantization can reduce the runtime memory requirement and increase the inference speed.
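
For reference, here is a minimal sketch of how the compiler-config discussed in this patch is consumed end to end. It assumes the `compile_pipe` entry point that `benchmarks/text_to_image.py` imports from `onediffx`, plus the public `PixArt-alpha/PixArt-XL-2-1024-MS` checkpoint; the prompt, dtype, step count, and output path are illustrative, not part of the patch.

```python
import torch
from diffusers import PixArtAlphaPipeline

# compile_pipe is the same entry point used by benchmarks/text_to_image.py;
# assumed importable from onediffx as in onediff_diffusers_extensions.
from onediffx import compile_pipe

pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16
).to("cuda")

# The default mode from this patch: roughly 500s of one-time compilation
# (CPU-dependent) in exchange for faster steady-state inference.
options = '{"mode": "max-optimize:max-autotune:freezing", "memory_format": "channels_last"}'
pipe = compile_pipe(
    pipe, backend="nexfort", options=options, fuse_qkv_projections=True
)

# The first call triggers nexfort compilation (the warmup cost in the tables
# above); subsequent calls run at the optimized speed.
image = pipe("a small cactus with a happy face", num_inference_steps=20).images[0]
image.save("pixart_alpha.png")
```

Swapping the `mode` string for `max-autotune` or `jit:disable-runtime-fusion` trades steady-state speed for the much shorter warmup times listed in the compile-config section above.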