Comments

daniil-lyakhov · Jan 17, 2025 · 9511c7c · 9511c7c
1 parent 0bf1e1e
commit 9511c7c
Show file tree

Hide file tree

Showing 10 changed files with 9,125 additions and 8,715 deletions.
diff --git a/nncf/experimental/common/quantization/algorithms/quantizer/openvino_quantizer.py b/nncf/experimental/common/quantization/algorithms/quantizer/openvino_quantizer.py
@@ -17,7 +17,7 @@
 from torch.ao.quantization.observer import PerChannelMinMaxObserver
 from torch.ao.quantization.quantizer.quantizer import QuantizationAnnotation as InductorQAnotation
 from torch.ao.quantization.quantizer.quantizer import QuantizationSpec as InductorQuantizationSpec
-from torch.ao.quantization.quantizer.quantizer import Quantizer as InductorQuantizer
+from torch.ao.quantization.quantizer.quantizer import Quantizer
 
 from nncf.common.graph.graph import NNCFGraph
 from nncf.common.quantization.quantizer_propagation.solver import QuantizerPropagationRule
@@ -26,7 +26,7 @@
 from nncf.common.quantization.structs import QuantizationPreset
 from nncf.common.quantization.structs import QuantizationScheme
 from nncf.common.quantization.structs import QuantizerConfig as NNCFQuantizerConfig
-from nncf.experimental.common.quantization.algorithms.quantizer.base_quantizer import NNCFQuantizer
+from nncf.experimental.quantization.algorithms.quantizer.base_quantizer import Quantizer as NNCFQuantizer
 from nncf.experimental.torch.fx.nncf_graph_builder import GraphConverter
 from nncf.experimental.torch.fx.node_utils import get_graph_node_by_name
 from nncf.experimental.torch.fx.transformations import fold_constant_except_qdq
@@ -42,7 +42,7 @@
 QUANT_ANNOTATION_KEY = "quantization_annotation"
 
 
-class OpenVINOQuantizer(InductorQuantizer, NNCFQuantizer):
+class OpenVINOQuantizer(Quantizer):
     def __init__(
         self,
         mode: Optional[QuantizationMode] = None,
@@ -169,3 +169,11 @@ def validate(self, model: torch.fx.GraphModule) -> None:
     def transform_for_annotation(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
         fold_constant_except_qdq(model)
         return model
+
+
+class OpenVINOQuantizerAdapter(NNCFQuantizer):
+    def __init__(self, quantizer: OpenVINOQuantizer):
+        self._quantizer = quantizer
+
+    def get_quantization_setup(self, model: torch.fx.GraphModule, nncf_graph: NNCFGraph) -> SingleConfigQuantizerSetup:
+        return self._quantizer.get_quantization_setup(model, nncf_graph)
diff --git a/nncf/experimental/torch/fx/quantization/quantize_pt2e.py b/nncf/experimental/torch/fx/quantization/quantize_pt2e.py
@@ -26,10 +26,12 @@
 from nncf.common.factory import NNCFGraphFactory
 from nncf.common.logging import nncf_logger
 from nncf.data import Dataset
+from nncf.experimental.common.quantization.algorithms.quantizer.openvino_quantizer import OpenVINOQuantizerAdapter
 from nncf.experimental.quantization.algorithms.post_training.algorithm import ExperimentalPostTrainingQuantization
 from nncf.experimental.quantization.quantizers.torch_ao_adapter import TorchAOQuantizerAdapter
 from nncf.experimental.torch.fx.constant_folding import constant_fold
 from nncf.experimental.torch.fx.transformations import QUANTIZE_NODE_TARGETS
+from nncf.experimental.torch.fx.transformations import compress_post_quantize_transformation
 from nncf.quantization.advanced_parameters import AdvancedBiasCorrectionParameters
 from nncf.quantization.advanced_parameters import AdvancedSmoothQuantParameters
 from nncf.quantization.advanced_parameters import RangeEstimatorParameters
@@ -114,7 +116,10 @@ def quantize_pt2e(
     quantized_model = GraphModule(quantized_model, quantized_model.graph)
 
     if fold_quantize:
-        constant_fold(quantized_model, _quant_node_constraint)
+        if isinstance(quantizer, OpenVINOQuantizerAdapter):
+            compress_post_quantize_transformation(quantized_model)
+        else:
+            constant_fold(quantized_model, _quant_node_constraint)
 
     pm = PassManager([DuplicateDQPass()])
 

diff --git a/tests/torch/data/reference_graphs/fx/experimental/OpenVINOQuantizer/mobilenet_v3_small.dot b/tests/torch/data/reference_graphs/fx/experimental/OpenVINOQuantizer/mobilenet_v3_small.dot
diff --git a/tests/torch/data/reference_graphs/fx/experimental/OpenVINOQuantizer/resnet18.dot b/tests/torch/data/reference_graphs/fx/experimental/OpenVINOQuantizer/resnet18.dot
diff --git a/tests/torch/data/reference_graphs/fx/experimental/OpenVINOQuantizer/swin_v2_s.dot b/tests/torch/data/reference_graphs/fx/experimental/OpenVINOQuantizer/swin_v2_s.dot
diff --git a/tests/torch/data/reference_graphs/fx/experimental/OpenVINOQuantizer/synthetic_transformer.dot b/tests/torch/data/reference_graphs/fx/experimental/OpenVINOQuantizer/synthetic_transformer.dot
@@ -7,39 +7,43 @@ strict digraph  {
 "5 embedding_0_0_nncf_smooth_quant_0" [id=5, type=call_module];
 "6 quantize_per_tensor_default" [id=6, type=quantize_per_tensor];
 "7 dequantize_per_tensor_default" [id=7, type=dequantize_per_tensor];
-"8 linear_scale_0" [id=8, type=get_attr];
-"9 linear_zero_point_0" [id=9, type=get_attr];
-"10 _frozen_param0" [id=10, type=get_attr];
-"11 dequantize_per_channel_default" [id=11, type=dequantize_per_channel];
-"12 linear" [id=12, type=linear];
-"13 linear_0_0_nncf_smooth_quant_0" [id=13, type=call_module];
-"14 quantize_per_tensor_default_1" [id=14, type=quantize_per_tensor];
-"15 dequantize_per_tensor_default_1" [id=15, type=dequantize_per_tensor];
-"16 linear_1_scale_0" [id=16, type=get_attr];
-"17 linear_1_zero_point_0" [id=17, type=get_attr];
-"18 _frozen_param1" [id=18, type=get_attr];
-"19 dequantize_per_channel_default_1" [id=19, type=dequantize_per_channel];
-"20 linear_1" [id=20, type=linear];
-"21 output" [id=21, type=output];
+"8 scale_updated_constant0" [id=8, type=get_attr];
+"9 compressed_weight_updated_constant0" [id=9, type=get_attr];
+"10 mul_tensor" [id=10, type=mul];
+"11 zero_point_updated_constant0" [id=11, type=get_attr];
+"12 sub_tensor" [id=12, type=sub];
+"13 linear" [id=13, type=linear];
+"14 linear_0_0_nncf_smooth_quant_0" [id=14, type=call_module];
+"15 quantize_per_tensor_default_1" [id=15, type=quantize_per_tensor];
+"16 dequantize_per_tensor_default_1" [id=16, type=dequantize_per_tensor];
+"17 scale_updated_constant1" [id=17, type=get_attr];
+"18 compressed_weight_updated_constant1" [id=18, type=get_attr];
+"19 mul_tensor_1" [id=19, type=mul];
+"20 zero_point_updated_constant1" [id=20, type=get_attr];
+"21 sub_tensor_1" [id=21, type=sub];
+"22 linear_1" [id=22, type=linear];
+"23 output" [id=23, type=output];
 "0 wte_weight" -> "4 embedding"  [label="(10, 5)", style=solid];
-"1 linear_bias" -> "12 linear"  [label="(5,)", style=solid];
-"2 lm_head_bias" -> "20 linear_1"  [label="(10,)", style=solid];
+"1 linear_bias" -> "13 linear"  [label="(5,)", style=solid];
+"2 lm_head_bias" -> "22 linear_1"  [label="(10,)", style=solid];
 "3 input_ids" -> "4 embedding"  [label="(5,)", style=solid];
 "4 embedding" -> "5 embedding_0_0_nncf_smooth_quant_0"  [label="(5, 5)", style=solid];
 "5 embedding_0_0_nncf_smooth_quant_0" -> "6 quantize_per_tensor_default"  [label="(5, 5)", style=solid];
 "6 quantize_per_tensor_default" -> "7 dequantize_per_tensor_default"  [label="(5, 5)", style=solid];
-"7 dequantize_per_tensor_default" -> "12 linear"  [label="(5, 5)", style=solid];
-"8 linear_scale_0" -> "11 dequantize_per_channel_default"  [label="(5,)", style=solid];
-"9 linear_zero_point_0" -> "11 dequantize_per_channel_default"  [label="(5,)", style=solid];
-"10 _frozen_param0" -> "11 dequantize_per_channel_default"  [label="(5, 5)", style=solid];
-"11 dequantize_per_channel_default" -> "12 linear"  [label="(5, 5)", style=solid];
-"12 linear" -> "13 linear_0_0_nncf_smooth_quant_0"  [label="(5, 5)", style=solid];
-"13 linear_0_0_nncf_smooth_quant_0" -> "14 quantize_per_tensor_default_1"  [label="(5, 5)", style=solid];
-"14 quantize_per_tensor_default_1" -> "15 dequantize_per_tensor_default_1"  [label="(5, 5)", style=solid];
-"15 dequantize_per_tensor_default_1" -> "20 linear_1"  [label="(5, 5)", style=solid];
-"16 linear_1_scale_0" -> "19 dequantize_per_channel_default_1"  [label="(10,)", style=solid];
-"17 linear_1_zero_point_0" -> "19 dequantize_per_channel_default_1"  [label="(10,)", style=solid];
-"18 _frozen_param1" -> "19 dequantize_per_channel_default_1"  [label="(10, 5)", style=solid];
-"19 dequantize_per_channel_default_1" -> "20 linear_1"  [label="(10, 5)", style=solid];
-"20 linear_1" -> "21 output"  [label="(5, 10)", style=solid];
+"7 dequantize_per_tensor_default" -> "13 linear"  [label="(5, 5)", style=solid];
+"8 scale_updated_constant0" -> "10 mul_tensor"  [label="(5, 1)", style=solid];
+"9 compressed_weight_updated_constant0" -> "10 mul_tensor"  [label="(5, 5)", style=solid];
+"10 mul_tensor" -> "12 sub_tensor"  [label="(5, 5)", style=solid];
+"11 zero_point_updated_constant0" -> "12 sub_tensor"  [label="(5, 1)", style=solid];
+"12 sub_tensor" -> "13 linear"  [label="(5, 5)", style=solid];
+"13 linear" -> "14 linear_0_0_nncf_smooth_quant_0"  [label="(5, 5)", style=solid];
+"14 linear_0_0_nncf_smooth_quant_0" -> "15 quantize_per_tensor_default_1"  [label="(5, 5)", style=solid];
+"15 quantize_per_tensor_default_1" -> "16 dequantize_per_tensor_default_1"  [label="(5, 5)", style=solid];
+"16 dequantize_per_tensor_default_1" -> "22 linear_1"  [label="(5, 5)", style=solid];
+"17 scale_updated_constant1" -> "19 mul_tensor_1"  [label="(10, 1)", style=solid];
+"18 compressed_weight_updated_constant1" -> "19 mul_tensor_1"  [label="(10, 5)", style=solid];
+"19 mul_tensor_1" -> "21 sub_tensor_1"  [label="(10, 5)", style=solid];
+"20 zero_point_updated_constant1" -> "21 sub_tensor_1"  [label="(10, 1)", style=solid];
+"21 sub_tensor_1" -> "22 linear_1"  [label="(10, 5)", style=solid];
+"22 linear_1" -> "23 output"  [label="(5, 10)", style=solid];
 }