
Commit

minor fixes
l-bat committed May 16, 2024
1 parent 7fb930b commit 8eadc60
Showing 6 changed files with 23 additions and 19 deletions.
2 changes: 1 addition & 1 deletion docs/compression_algorithms/CompressWeights.md
@@ -9,7 +9,7 @@ The Weights Compression algorithm is aimed at compressing the weights of the mod
#### Supported modes

By default, weights are compressed asymmetrically to 8-bit integer data type - "INT8_ASYM" mode.
- OpenVINO backend also supports 3 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM and NF4. The primary precision in case of INT4_SYM mode is signed 4-bit integer and weights are quantized to it [symmetrically](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) without zero point. In case of INT4_ASYM mode - unsigned 4-bit integer, but weight are quantized to it [asymmetrically](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization) with a typical non-fixed zero point. In case of NF4 mode - [nf4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point.
+ OpenVINO backend also supports 3 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM and NF4. The primary precision in case of INT4_SYM mode is signed 4-bit integer and weights are quantized to it [symmetrically](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) without zero point. In case of INT4_ASYM mode - unsigned 4-bit integer and weight are quantized to it [asymmetrically](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization) with a typical non-fixed zero point. In case of NF4 mode - [nf4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point.
All 4-bit modes have a grouped quantization support, when small group of weights (e.g. 128) in the channel dimension share quantization parameters (scale).
All embeddings, convolutions and last linear layers are always compressed to 8-bit integer data type. To quantize embeddings and last linear layers to 4-bit, use `all_layers=True`.
Percent of the rest layers compressed to 4-bit can be configured by "ratio" parameter. E.g. ratio=0.9 means 90% of layers compressed to the corresponding 4-bit data type and the rest to 8-bit asymmetric integer data type.
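For a quick, end-to-end illustration of these options, here is a minimal usage sketch of the NNCF weight compression API; `model` is a placeholder for an already loaded OpenVINO or Torch model, and the parameter values are example choices rather than recommendations:

import nncf
from nncf import CompressWeightsMode

# Mixed-precision 4-bit compression: symmetric INT4 as the primary precision,
# group-wise scales, and 90% of eligible layers in 4-bit (the rest fall back to INT8_ASYM).
compressed_model = nncf.compress_weights(
    model,
    mode=CompressWeightsMode.INT4_SYM,
    group_size=128,    # 128 weights along the channel dimension share one scale
    ratio=0.9,         # portion of eligible layers compressed to the 4-bit mode
    all_layers=False,  # keep embeddings and the last linear layer in 8-bit
)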
4 changes: 2 additions & 2 deletions nncf/parameters.py
@@ -68,13 +68,13 @@ class CompressWeightsMode(StrEnum):
"""
Defines a mode for weight compression.
:param INT8_SYM: Stands for 8-bit integer symmetric quantization of all weights.
- Weights are quantized symmetrically with a fixed zero point equals to 128.
+ Weights are quantized symmetrically without zero point.
https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization
:param INT8_ASYM: The same as INT8_SYM mode, but weights are quantized to a primary precision asymmetrically
with a typical non-fixed zero point.
https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization
:param INT4_SYM: Stands for a mixed-precision weights quantization with 4-bit integer as a primary precision.
- Weights are quantized to a primary precision symmetrically with a fixed zero point equals to 8.
+ Weights are quantized to a primary precision symmetrically without zero point.
All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM,
by default. All others are quantized whether to 4-bit integer or to a backup precision depending on
criteria and the given ratio.
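To make the symmetric/asymmetric wording above concrete, here is a simplified per-tensor NumPy sketch of the two INT8 schemes (an illustration only; the NNCF implementation works per-channel/per-group on its own tensor abstraction and guards the scale with machine epsilon):

import numpy as np

def int8_sym(weight: np.ndarray) -> np.ndarray:
    # Symmetric: a scale only, no zero point; signed levels in [-128, 127].
    scale = np.abs(weight).max() / 127
    return np.clip(np.round(weight / scale), -128, 127).astype(np.int8)

def int8_asym(weight: np.ndarray) -> np.ndarray:
    # Asymmetric: a scale plus a non-fixed zero point; unsigned levels in [0, 255].
    w_min, w_max = float(weight.min()), float(weight.max())
    scale = (w_max - w_min) / 255
    zero_point = np.round(-w_min / scale)
    return np.clip(np.round(weight / scale) + zero_point, 0, 255).astype(np.uint8)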
4 changes: 2 additions & 2 deletions nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -68,11 +68,11 @@ def __init__(
"""
:param mode: Defines a mode for weight compression.
INT8_SYM stands for 8-bit integer symmetric quantization of all weights.
- Weights are quantized symmetrically with a fixed zero point equals to 128.
+ Weights are quantized symmetrically without zero point.
INT8_ASYM is the same as INT8_SYM mode, but weights are quantized to a primary precision asymmetrically
with a typical non-fixed zero point.
INT4_SYM stands for a mixed-precision weights quantization with 4-bit integer as a primary precision.
- Weights are quantized to a primary precision symmetrically with a fixed zero point equals to 8.
+ Weights are quantized to a primary precision symmetrically without zero point.
All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM,
by default. All others are quantized whether to 4-bit integer or to a backup precision depending on
criteria and the given ratio.
@@ -141,31 +141,30 @@ def do_integer_quantization(
    if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]:
        level_low = 0
        level_high = 2**num_bits - 1
+       dtype = TensorDataType.uint8

        min_values = fns.min(weight, axis=reduction_axes, keepdims=True)  # [a1, r, a2] -> [a1, 1, a2]
        max_values = fns.max(weight, axis=reduction_axes, keepdims=True)  # [a1, r, a2] -> [a1, 1, a2]
        scale, zero_point = calculate_scale_zero_point(
            min_values, max_values, level_low, level_high, narrow_range=False
        )
-       compressed_weights = fns.round(weight / scale + zero_point.astype(weight.dtype))
-       compressed_weights = fns.clip(compressed_weights, level_low, level_high).astype(TensorDataType.uint8)
    else:
-       level_low_sym = -(2 ** (num_bits - 1))
-       level_high_sym = 2 ** (num_bits - 1) - 1
+       level_low = -(2 ** (num_bits - 1))
+       level_high = 2 ** (num_bits - 1) - 1
+       dtype = TensorDataType.int8

        scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True)  # [a1, r//gs, 1, a2]
-       scale = scale / level_high_sym
+       scale = scale / level_high
        zero_point = fns.zeros_like(scale).astype(TensorDataType.int32)
        eps = fns.finfo(scale).eps
        # NOTE: adding machine epsilon to avoid division by zero
        scale = fns.where(fns.abs(scale) < eps, eps, scale)
-       compressed_weights = fns.round(weight / scale)
-       compressed_weights = fns.clip(compressed_weights, level_low_sym, level_high_sym).astype(TensorDataType.int8)

    if precomputed_scale is not None:
        scale = precomputed_scale

-   compressed_weights = fns.clip(compressed_weights, level_low, level_high).astype(TensorDataType.uint8)
+   compressed_weights = fns.round(weight / scale + zero_point.astype(weight.dtype))
+   compressed_weights = fns.clip(compressed_weights, level_low, level_high).astype(dtype)
    return compressed_weights, scale, zero_point
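The unified tail works for both branches because the symmetric branch sets zero_point to a zeros tensor, so adding it is a no-op, while level_low, level_high and dtype carry the branch-specific range and storage type. A small self-contained NumPy example of the resulting 4-bit arithmetic (an illustration, not a call into the NNCF helpers):

import numpy as np

num_bits = 4
weight = np.array([-0.8, -0.1, 0.05, 0.6], dtype=np.float32)

# Symmetric: zero point is 0, levels are [-8, 7], stored as int8.
level_low, level_high = -(2 ** (num_bits - 1)), 2 ** (num_bits - 1) - 1
scale = np.abs(weight).max() / level_high
q_sym = np.clip(np.round(weight / scale), level_low, level_high).astype(np.int8)

# Asymmetric: non-fixed zero point, levels are [0, 15], stored as uint8.
level_low, level_high = 0, 2 ** num_bits - 1
scale = (weight.max() - weight.min()) / (level_high - level_low)
zero_point = np.round(-weight.min() / scale)
q_asym = np.clip(np.round(weight / scale + zero_point), level_low, level_high).astype(np.uint8)

print(q_sym)   # [-7 -1  0  5]
print(q_asym)  # [ 0  8 10 15]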


6 changes: 3 additions & 3 deletions nncf/quantization/quantize_model.py
@@ -347,11 +347,11 @@ def compress_weights(
:param model: A model to be compressed.
:type model: TModel
:param mode: Defines a mode for weight compression.
- INT8_SYM stands for 8-bit integer symmetric quantization of all weights.
+ INT8_SYM stands for 8-bit integer symmetric quantization of all weights without zero point.
INT8_ASYM is the same as INT8_SYM mode, but weights are quantized to a primary precision asymmetrically
with a typical non-fixed zero point.
INT4_SYM stands for a mixed-precision weights quantization with 4-bit integer as a primary precision.
- Weights are quantized to a primary precision symmetrically with a fixed zero point equals to 8.
+ Weights are quantized to a primary precision symmetrically without zero point.
All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM,
by default. All others are quantized whether to 4-bit integer or to a backup precision depending on
criteria and the given ratio.
@@ -390,7 +390,7 @@ def compress_weights(
"""
if mode == CompressWeightsMode.INT8:
warning_deprecated(
"`CompressWeightsMode.INT8` is deprecated." "Please, use `CompressWeightsMode.INT8_ASYM` as value instead."
"`CompressWeightsMode.INT8` is deprecated. Please, use `CompressWeightsMode.INT8_ASYM` as value instead."
)
mode = CompressWeightsMode.INT8_ASYM
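From the caller's side, the deprecation only changes the enum value that is passed in; a migration sketch with `model` as a placeholder:

import nncf
from nncf import CompressWeightsMode

# Deprecated spelling: still accepted, but emits the warning above and is remapped to INT8_ASYM.
# compressed = nncf.compress_weights(model, mode=CompressWeightsMode.INT8)

# Preferred equivalent:
compressed = nncf.compress_weights(model, mode=CompressWeightsMode.INT8_ASYM)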

11 changes: 8 additions & 3 deletions tests/torch/ptq/test_weights_compression.py
@@ -17,6 +17,7 @@
from nncf import SensitivityMetric
from nncf.quantization import compress_weights
from nncf.torch import wrap_model
+from nncf.torch.quantization.layers import AsymmetricWeightsDecompressor
from nncf.torch.quantization.layers import SymmetricWeightsDecompressor

DATA_BASED_SENSITIVITY_METRICS = (
@@ -127,16 +128,20 @@ def test_compress_weights(mode):
    assert n_compressed_weights == n_target_modules


-def test_compress_weights_functional_model():
+@pytest.mark.parametrize("mode", (CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM))
+def test_compress_weights_functional_model(mode):
    model = FunctionalModel()
+   decompressor_type = (
+       SymmetricWeightsDecompressor if mode == CompressWeightsMode.INT8_SYM else AsymmetricWeightsDecompressor
+   )

    input_ids = torch.randint(0, 10, [1, 3, 300, 300])
    wrapped_model = wrap_model(model, example_input=input_ids, trace_parameters=True)
-   compressed_model = compress_weights(wrapped_model, mode=CompressWeightsMode.INT8_SYM)
+   compressed_model = compress_weights(wrapped_model, mode=mode)

    n_compressed_weights = 0
    for layer in compressed_model.nncf.external_op.values():
-       if isinstance(layer, SymmetricWeightsDecompressor):
+       if isinstance(layer, decompressor_type):
            n_compressed_weights += 1
    assert n_compressed_weights == 4
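For reference, the same counting logic the test relies on can be applied to any NNCF-compressed Torch model; a sketch that follows the attribute layout shown above (compressed_model.nncf.external_op) and is not part of the test suite:

from nncf.torch.quantization.layers import AsymmetricWeightsDecompressor
from nncf.torch.quantization.layers import SymmetricWeightsDecompressor

def count_weight_decompressors(compressed_model) -> int:
    # Count the weight decompressor ops that NNCF registered during compress_weights().
    return sum(
        isinstance(op, (SymmetricWeightsDecompressor, AsymmetricWeightsDecompressor))
        for op in compressed_model.nncf.external_op.values()
    )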

