Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add flux example #1126

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
166 changes: 166 additions & 0 deletions benchmarks/run_benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
#!/bin/bash
set -e

# Models to benchmark, taken from the first command-line argument,
# e.g. ./run_benchmark.sh sd15,sd21,sdxl or ./run_benchmark.sh all
run_model=$1

# Environment variables exported to the benchmark scripts.
export NEXFORT_GRAPH_CACHE=1 NEXFORT_FX_FORCE_TRITON_SDPA=1

# Locations of the model checkpoints.
model_dir="/data1/hf_model"
sd15_path="$model_dir/stable-diffusion-v1-5"
sd21_path="$model_dir/stable-diffusion-2-1"
sdxl_path="$model_dir/stable-diffusion-xl-base-1.0"
sd3_path="/data1/home/zhangxu/stable-diffusion-3-medium-diffusers"
flux_dev_path="$model_dir/FLUX.1-dev/snapshots/0ef5fff789c832c5c7f4e127f94c8b54bbcced44"
flux_schnell_path="$model_dir/FLUX.1-schnell"

# Today's date (yyyy-mm-dd), recorded in every result row.
current_time=$(date '+%F')
echo "Current time: ${current_time}"

# Name of the first GPU reported by nvidia-smi, with the "NVIDIA " prefix
# dropped and spaces turned into underscores.
gpu_name=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader,nounits | head -n 1)
gpu_name=${gpu_name/NVIDIA /}
gpu_name=${gpu_name// /_}

XuZhang99 marked this conversation as resolved.
Show resolved Hide resolved
# Markdown table: header row followed by the separator row. The "\n" sequences
# are kept literal here and expanded later when the table is printed.
BENCHMARK_RESULT_TEXT='| Data update date (yyyy-mm-dd) | GPU | Model | HxW | Compiler | Quantization | Iteration speed (it/s) | E2E Time (s) | Max used CUDA memory (GiB) | Warmup time (s) |\n'
BENCHMARK_RESULT_TEXT+='| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n'

# Prompt and quantization settings shared by every benchmark run.
prompt='beautiful scenery nature glass bottle landscape, purple galaxy bottle'
quantize_config='{"quant_type": "fp8_e4m3_e4m3_dynamic_per_tensor"}'

# oneflow has no compiler_config
#sd15_nexfort_compiler_config=""
#sd21_nexfort_compiler_config=""
#sdxl_nexfort_compiler_config=""

# nexfort compiler configs; they differ only in the "mode" value.
printf -v sd3_nexfort_compiler_config '{"mode": "%s", "memory_format": "channels_last"}' \
  'max-optimize:max-autotune:low-precision:cache-all'
printf -v flux_nexfort_compiler_config '{"mode": "%s", "memory_format": "channels_last"}' \
  'max-optimize:max-autotune:low-precision'


# benchmark model with one resolution function
#######################################
# Run one text-to-image benchmark and append its result as a markdown table
# row to BENCHMARK_RESULT_TEXT.
# Globals:
#   prompt, quantize_config, current_time, gpu_name (read)
#   BENCHMARK_RESULT_TEXT (appended to)
# Arguments:
#   $1 - model name; a name containing "sd3" or "flux" selects the matching
#        example script, anything else uses benchmarks/text_to_image.py
#   $2 - model path
#   $3 - number of inference steps
#   $4 - compiler, e.g. none, oneflow, nexfort, transform
#   $5 - compiler config, passed to --compiler-config
#   $6 - image height
#   $7 - image width
#   $8 - "True" to add --quantize/--quantize-config; any other value skips it
# Outputs:
#   A progress line on stdout. The benchmark script's output is mirrored live
#   to the controlling terminal, or to stderr when there is no terminal.
# Returns:
#   Non-zero if the benchmark script exits with a failure status.
#######################################
benchmark_model_with_one_resolution() {
  local model_name=$1
  local model_path=$2
  local steps=$3
  local compiler=$4
  local compiler_config=$5
  local height=$6
  local width=$7
  local quantize=$8

  echo "Running ${model_path} ${height}x${width}..."

  # Pick the benchmark script from the model name.
  local script_path
  if [[ "${model_name}" =~ sd3 ]]; then
    script_path="onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py"
  elif [[ "${model_name}" =~ flux ]]; then
    script_path="onediff_diffusers_extensions/examples/flux/text_to_image_flux.py"
  else
    script_path="benchmarks/text_to_image.py"
  fi

  # Build the command line once as an array so that every argument stays a
  # single word, even if a path or config contains spaces.
  local -a cmd=(
    "${script_path}"
    --model "${model_path}" --variant fp16 --steps "${steps}"
    --height "${height}" --width "${width}" --seed 1
    --compiler "${compiler}" --compiler-config "${compiler_config}"
  )
  if [[ "${quantize}" == True ]]; then
    cmd+=(--quantize --quantize-config "${quantize_config}")
  fi
  cmd+=(--prompt "${prompt}" --print-output)

  # Mirror the live output to the terminal when one can be opened; without a
  # controlling terminal (CI, cron, nohup) opening /dev/tty fails, so fall
  # back to stderr instead of letting tee fail.
  local mirror=/dev/stderr
  if ( : >/dev/tty ) 2>/dev/null; then
    mirror=/dev/tty
  fi

  # pipefail is enabled only inside the substitution's subshell so that a
  # failing benchmark script is not hidden by tee's exit status.
  local script_output
  script_output=$(
    set -o pipefail
    python3 "${cmd[@]}" | tee "${mirror}"
  ) || {
    echo "Benchmark failed: ${model_name} ${height}x${width}" \
      "(compiler=${compiler}, quantize=${quantize})" >&2
    return 1
  }

  # Extract inference time, iterations per second, max used CUDA memory and
  # warmup time from the captured output.
  local inference_time iterations_per_second max_used_cuda_memory warmup_time
  inference_time=$(grep -oP '(?<=Inference time: )\d+\.\d+' <<<"${script_output}")
  iterations_per_second=$(grep -oP '(?<=Iterations per second: )\d+\.\d+' <<<"${script_output}")
  max_used_cuda_memory=$(grep -oP '(?<=Max used CUDA memory : )\d+\.\d+' <<<"${script_output}")
  warmup_time=$(grep -oP '(?<=Warmup time: )\d+\.\d+' <<<"${script_output}")

  # Append one table row; "\n" stays literal and is expanded when printed.
  BENCHMARK_RESULT_TEXT+="| ${current_time} | ${gpu_name} | ${model_name} | ${height}x${width} | ${compiler} | ${quantize} | ${iterations_per_second} | ${inference_time} | ${max_used_cuda_memory} | ${warmup_time} |\n"
}

# Source conda's shell setup script from the miniconda3 install in the home
# directory (presumably what makes `conda activate` usable further down).
. ~/miniconda3/etc/profile.d/conda.sh

XuZhang99 marked this conversation as resolved.
Show resolved Hide resolved
#########################################
# Models benchmarked in the "oneflow" conda environment. Each model gets an
# uncompiled baseline, a oneflow-compiled run, and a oneflow-compiled run with
# quantization; "none" is passed as the compiler config.

# if run_model contains sd15 or all, run sd15
if [[ "${run_model}" =~ sd15|all ]]; then
  # activate oneflow environment
  conda activate oneflow
  benchmark_model_with_one_resolution sd15 "${sd15_path}" 30 none none 512 512 False
  benchmark_model_with_one_resolution sd15 "${sd15_path}" 30 oneflow none 512 512 False
  benchmark_model_with_one_resolution sd15 "${sd15_path}" 30 oneflow none 512 512 True
fi

# if run_model contains sd21 or all, run sd21
if [[ "${run_model}" =~ sd21|all ]]; then
  # activate oneflow environment
  conda activate oneflow
  benchmark_model_with_one_resolution sd21 "${sd21_path}" 20 none none 768 768 False
  benchmark_model_with_one_resolution sd21 "${sd21_path}" 20 oneflow none 768 768 False
  benchmark_model_with_one_resolution sd21 "${sd21_path}" 20 oneflow none 768 768 True
fi

# if run_model contains sdxl or all, run sdxl
if [[ "${run_model}" =~ sdxl|all ]]; then
  # activate oneflow environment
  conda activate oneflow
  benchmark_model_with_one_resolution sdxl "${sdxl_path}" 30 none none 1024 1024 False
  benchmark_model_with_one_resolution sdxl "${sdxl_path}" 30 oneflow none 1024 1024 False
  benchmark_model_with_one_resolution sdxl "${sdxl_path}" 30 oneflow none 1024 1024 True
fi
#########################################

#########################################
# Models benchmarked in the "nexfort" conda environment.

# if run_model contains sd3 or all, run sd3
if [[ "${run_model}" =~ sd3|all ]]; then
  # activate nexfort environment
  conda activate nexfort
  benchmark_model_with_one_resolution sd3 "${sd3_path}" 28 none none 1024 1024 False
  benchmark_model_with_one_resolution sd3 "${sd3_path}" 28 nexfort "${sd3_nexfort_compiler_config}" 1024 1024 False
  benchmark_model_with_one_resolution sd3 "${sd3_path}" 28 nexfort "${sd3_nexfort_compiler_config}" 1024 1024 True
fi

# if run_model contains flux or all, run flux
if [[ "${run_model}" =~ flux|all ]]; then
  # activate nexfort environment
  conda activate nexfort

  # FLUX.1-dev, 20 steps
  benchmark_model_with_one_resolution flux_dev "${flux_dev_path}" 20 none none 1024 1024 False
  benchmark_model_with_one_resolution flux_dev "${flux_dev_path}" 20 nexfort "${flux_nexfort_compiler_config}" 1024 1024 False
  benchmark_model_with_one_resolution flux_dev "${flux_dev_path}" 20 nexfort "${flux_nexfort_compiler_config}" 1024 1024 True
  benchmark_model_with_one_resolution flux_dev "${flux_dev_path}" 20 transform none 1024 1024 False

  # FLUX.1-schnell, 4 steps
  benchmark_model_with_one_resolution flux_schnell "${flux_schnell_path}" 4 none none 1024 1024 False
  benchmark_model_with_one_resolution flux_schnell "${flux_schnell_path}" 4 nexfort "${flux_nexfort_compiler_config}" 1024 1024 False
  benchmark_model_with_one_resolution flux_schnell "${flux_schnell_path}" 4 nexfort "${flux_nexfort_compiler_config}" 1024 1024 True
  benchmark_model_with_one_resolution flux_schnell "${flux_schnell_path}" 4 transform none 1024 1024 False
fi
XuZhang99 marked this conversation as resolved.
Show resolved Hide resolved
#########################################


echo -e "\nBenchmark Results:"
# Print the result table and append it to the per-GPU markdown file. The
# expansion is quoted so the table text is passed to echo as a single word:
# unquoted, it would be word-split (collapsing runs of spaces, e.g. around
# empty cells) and subjected to pathname expansion before being printed.
echo -e "${BENCHMARK_RESULT_TEXT}" | tee -a "benchmark_result_${gpu_name}.md"
echo -e "\nBenchmark Done!"
8 changes: 8 additions & 0 deletions benchmarks/text_to_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import torch
from diffusers.utils import load_image
from onediff.infer_compiler import oneflow_compile
from onediff.optimization.quant_optimizer import quantize_model

from onediffx import ( # quantize_pipe currently only supports the nexfort backend.
compile_pipe,
Expand Down Expand Up @@ -252,6 +253,13 @@ def main():
print("Oneflow backend is now active...")
# Note: The compile_pipe() based on the oneflow backend is incompatible with T5EncoderModel.
# pipe = compile_pipe(pipe)

if args.quantize:
if hasattr(pipe, "unet"):
pipe.unet = quantize_model(pipe.unet)
if hasattr(pipe, "transformer"):
pipe.transformer = quantize_model(pipe.transformer)

if hasattr(pipe, "unet"):
pipe.unet = oneflow_compile(pipe.unet)
if hasattr(pipe, "transformer"):
Expand Down
129 changes: 129 additions & 0 deletions onediff_diffusers_extensions/examples/flux/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Run Flux with onediff


## Environment setup

### Set up onediff
https://github.com/siliconflow/onediff?tab=readme-ov-file#installation

### Set up compiler backend
Two compiler backends are supported: oneflow and nexfort.

https://github.com/siliconflow/onediff?tab=readme-ov-file#install-a-compiler-backend

### Set up flux
HF model: https://huggingface.co/black-forest-labs/FLUX.1-dev and https://huggingface.co/black-forest-labs/FLUX.1-schnell

HF pipeline: https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux

### Set up others
Install the extra packages and set the required environment variable.
```bash
pip install --upgrade transformers
pip install --upgrade diffusers[torch]
pip install nvidia-cublas-cu12==12.4.5.8

export NEXFORT_FX_FORCE_TRITON_SDPA=1
```

## Run

### Run FLUX.1-dev 1024*1024 without compile (the original pytorch HF diffusers baseline)
```
python3 onediff_diffusers_extensions/examples/flux/text_to_image_flux.py \
--model black-forest-labs/FLUX.1-dev \
--height 1024 \
--width 1024 \
--steps 20 \
--seed 1 \
--output-image ./flux.png
```

### Run FLUX.1-dev 1024*1024 with compile [nexfort backend]

```
python3 onediff_diffusers_extensions/examples/flux/text_to_image_flux.py \
--model black-forest-labs/FLUX.1-dev \
--height 1024 \
--width 1024 \
--steps 20 \
--seed 1 \
--compiler nexfort \
--compiler-config '{"mode": "max-optimize:max-autotune:low-precision:cache-all", "memory_format": "channels_last"}' \
--output-image ./flux_nexfort_compile.png
```


### Run FLUX.1-schnell 1024*1024 without compile (the original pytorch HF diffusers baseline)
```
python3 onediff_diffusers_extensions/examples/flux/text_to_image_flux.py \
--model black-forest-labs/FLUX.1-schnell \
--height 1024 \
--width 1024 \
--steps 4 \
--seed 1 \
--output-image ./flux.png
```

### Run FLUX.1-schnell 1024*1024 with compile [nexfort backend]

```
python3 onediff_diffusers_extensions/examples/flux/text_to_image_flux.py \
--model black-forest-labs/FLUX.1-schnell \
--height 1024 \
--width 1024 \
--steps 4 \
--seed 1 \
--compiler nexfort \
--compiler-config '{"mode": "max-optimize:max-autotune:low-precision:cache-all", "memory_format": "channels_last"}' \
--output-image ./flux_nexfort_compile.png
```


## FLUX.1-dev Performance comparison
**Testing on NVIDIA H20-SXM4-80GB:**

Data update date: 2024-10-23

| Framework | Iteration Speed (it/s) | E2E Time (seconds) | Max Memory Used (GiB) | Warmup time (seconds) <sup>1</sup> | Warmup with Cache time (seconds) |
|--------------------|------------------------|--------------------|-----------------------|-------------|------------------------|
| PyTorch | 1.30 | 15.72 | 35.73 | 16.68 | - |
| OneDiff (NexFort) | 1.76 (+35.4%) | 11.57 (-26.4%) | 34.85 | 750.78 | 28.57 |

<sup>1</sup> OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Platinum 8468V.

**Testing on NVIDIA L20-SXM4-48GB:**

Data update date: 2024-10-28

| Framework | Iteration Speed (it/s) | E2E Time (seconds) | Max Memory Used (GiB) | Warmup time (seconds) <sup>2</sup> | Warmup with Cache time (seconds) |
|--------------------|------------------------|--------------------|-----------------------|-------------|------------------------|
| PyTorch | 1.10 | 18.45 | 35.71 | 18.695 | - |
| OneDiff (NexFort) | 1.41 (+28.2%) | 14.44 (-21.7%) | 34.83 | 546.52 | 25.32 |

<sup>2</sup> OneDiff Warmup with Compilation time is tested on AMD EPYC 9354 32-Core Processor.



## FLUX.1-schnell Performance comparison
**Testing on NVIDIA H20-SXM4-80GB:**

Data update date: 2024-10-23

| Framework | Iteration Speed (it/s) | E2E Time (seconds) | Max Memory Used (GiB) | Warmup time (seconds) <sup>1</sup> | Warmup with Cache time (seconds) |
|--------------------|------------------------|--------------------|-----------------------|-------------|------------------------|
| PyTorch | 1.30 | 3.38 | 35.71 | 4.35 | - |
| OneDiff (NexFort) | 1.75 (+34.6%) | 2.46 (-27.2%) | 34.83 | 201.41 | 19.57 |

<sup>1</sup> OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Platinum 8468V.

**Testing on NVIDIA L20-SXM4-48GB:**

Data update date: 2024-10-28

| Framework | Iteration Speed (it/s) | E2E Time (seconds) | Max Memory Used (GiB) | Warmup time (seconds) <sup>2</sup> | Warmup with Cache time (seconds) |
|--------------------|------------------------|--------------------|-----------------------|-------------|------------------------|
| PyTorch | 1.10 | 3.94 | 35.69 | 4.15 | - |
| OneDiff (NexFort) | 1.41 (+28.2%) | 3.03 (-23.1%) | 34.81 | 145.63 | 13.56 |

<sup>2</sup> OneDiff Warmup with Compilation time is tested on AMD EPYC 9354 32-Core Processor.
Loading
Loading