siliconflow · Ldpe2G · Dec 12, 2022 · Dec 13, 2022 · Dec 13, 2022
diff --git a/src/diffusers/models/attention_oneflow.py b/src/diffusers/models/attention_oneflow.py
@@ -355,7 +355,7 @@ def _fused_forward(self, hidden_states):
         return hidden_states
 
     def forward(self, hidden_states):
-        return self._fused_forward(hidden_states)
+        #return self._fused_forward(hidden_states)
         residual = hidden_states
         batch, channel, height, width = hidden_states.shape
 
@@ -616,7 +616,7 @@ def _fused_forward(self, hidden_states, context=None, mask=None):
         return hidden_states
 
     def forward(self, hidden_states, context=None, mask=None):
-        return self._fused_forward(hidden_states, context=context, mask=context)
+        #return self._fused_forward(hidden_states, context=context, mask=context)
         batch_size, sequence_length, _ = hidden_states.shape
 
         query = self.to_q(hidden_states)

diff --git a/tests/test_models_unet_oneflow.py b/tests/test_models_unet_oneflow.py
@@ -13,6 +13,39 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
+import os
+
+# oneflow: '0.8.1.dev20221210+cu112'
+# diffusers: oneflow-fork-unet-tests, 0e749cf375967883f950a4b0d89898853368eead
+# Environment flags are set here, ahead of the imports that follow.
+os.environ["ONEFLOW_MLIR_CSE"] = "1"
+os.environ["ONEFLOW_MLIR_ENABLE_INFERENCE_OPTIMIZATION"] = "1"
+os.environ["ONEFLOW_MLIR_ENABLE_ROUND_TRIP"] = "1"
+os.environ["ONEFLOW_MLIR_FUSE_FORWARD_OPS"] = "1"
+os.environ["ONEFLOW_KERNEL_ENABLE_FUSED_CONV_BIAS"] = "1"
+os.environ["ONEFLOW_KERNEL_ENABLE_FUSED_LINEAR"] = "1"
+os.environ["ONEFLOW_KERENL_CONV_ENABLE_CUTLASS_IMPL"] = "1"
+os.environ["ONEFLOW_KERNEL_GLU_ENABLE_DUAL_GEMM_IMPL"] = "1"
+# Flags above: no output difference observed when set to "1" (fp32 and fp16).
+
+os.environ["ONEFLOW_CONV_ALLOW_HALF_PRECISION_ACCUMULATION"] = "1"
+os.environ["ONEFLOW_MATMUL_ALLOW_HALF_PRECISION_ACCUMULATION"] = "1"
+# Two flags above: no output difference observed when set to "1" (fp16 only).
+
+# The flags below are left commented out; the max_diff figures are presumably what enabling each one produced.
+# os.environ["ONEFLOW_MLIR_GROUP_MATMUL"] = "1"
+# fp16: max_diff=0.00328
+# fp32: max_diff=0.00080
+
+# os.environ["ONEFLOW_MLIR_PREFER_NHWC"] = "1"
+# fp16: max_diff=0.01563
+# fp32: max_diff=0.00089
+
+
+# os.environ["ONEFLOW_KERENL_FMHA_ENABLE_TRT_FLASH_ATTN_IMPL"] = "1"
+# Only relevant to the fused_multi_head_attention_inference kernel.
+
 import math
 import unittest
 
@@ -21,7 +54,7 @@
 from diffusers import OneFlowUNet2DConditionModel
 from diffusers.testing_oneflow_utils import floats_tensor, slow, torch_device
 
-from .test_modeling_common_oneflow import ModelTesterMixin
+from tests.test_modeling_common_oneflow import ModelTesterMixin
 
 
 class UNet2DConditionModelTests(ModelTesterMixin, unittest.TestCase):
@@ -61,3 +94,67 @@ def prepare_init_args_and_inputs_for_common(self):
         }
         inputs_dict = self.dummy_input
         return init_dict, inputs_dict
+
+    def test_compare_eager_graph_output(self):
+        """Run the pretrained UNet eagerly and through a `torch.nn.Graph` wrapper; outputs must agree within 1e-5."""
+        unet_pretrained_model_path = "/home/ldp/.cache/huggingface/diffusers/models--CompVis--stable-diffusion-v1-4/snapshots/a304b1ab1b59dd6c3ba9c40705c29c6de4144096/unet"
+        if not os.path.isdir(unet_pretrained_model_path):  # machine-specific snapshot: skip rather than error elsewhere
+            self.skipTest(f"pretrained UNet weights not found at {unet_pretrained_model_path}")
+        # data_type = torch.float32
+        data_type = torch.float16
+        loading_kwargs = {"torch_dtype": data_type}
+        unet = OneFlowUNet2DConditionModel.from_pretrained(unet_pretrained_model_path, **loading_kwargs).to(torch_device)
+        unet.eval()
+
+        def dummy_input():
+            """Build the keyword inputs (sample, timestep, encoder_hidden_states) on `torch_device`."""
+            batch_size = 2
+            num_channels = 4
+            sizes = (64, 64)
+            noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device).to(data_type)
+            time_step = torch.tensor([10]).to(torch_device)
+            encoder_hidden_states = floats_tensor((batch_size, 77, 768)).to(torch_device).to(data_type)
+            return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
+
+        class UNetGraph(torch.nn.Graph):
+            """Graph wrapper whose `build` calls the UNet and returns its `.sample`."""
+            def __init__(self, unet):
+                super().__init__()
+                self.unet = unet
+                self.config.enable_cudnn_conv_heuristic_search_algo(False)
+
+            def build(self, latent_model_input, t, text_embeddings):
+                text_embeddings = torch._C.amp_white_identity(text_embeddings)
+                return self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+
+        unet_graph = UNetGraph(unet)
+        inputs_dict = dummy_input()
+
+        with torch.no_grad():
+            eager_res = unet(**inputs_dict).sample
+            graph_res = unet_graph(inputs_dict["sample"], inputs_dict["timestep"], inputs_dict["encoder_hidden_states"])
+            import numpy as np
+            out_1 = eager_res.cpu().numpy()
+            out_2 = graph_res.cpu().numpy()
+            # One shared mask keeps the arrays element-aligned; masking each separately can misalign or mis-shape them.
+            valid = ~(np.isnan(out_1) | np.isnan(out_2))
+            max_diff = np.amax(np.abs(out_1[valid] - out_2[valid]))
+            print(f"max diff: {max_diff}")
+            self.assertLessEqual(max_diff, 1e-5)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+