Commit b1df29f: minor fixes
l-bat committed Jun 5, 2024
1 parent 0de5662
Showing 5 changed files with 16 additions and 11 deletions.
@@ -9,7 +9,7 @@ The Weights Compression algorithm is aimed at compressing the weights of the model
#### Supported modes

By default, weights are compressed asymmetrically to 8-bit integer data type - "INT8_ASYM" mode.
- OpenVINO backend also supports 3 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM and NF4. The primary precision in case of INT4_SYM mode is signed 4-bit integer and weights are quantized to it [symmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) without zero point. In case of INT4_ASYM mode - unsigned 4-bit integer, but weight are quantized to it [asymmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization) with a typical non-fixed zero point. In case of NF4 mode - [nf4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point.
+ OpenVINO backend also supports 3 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM and NF4. The primary precision in case of INT4_SYM mode is signed 4-bit integer and weights are quantized to it [symmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) without zero point. In case of INT4_ASYM mode - unsigned 4-bit integer and weight are quantized to it [asymmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization) with a typical non-fixed zero point. In case of NF4 mode - [nf4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point.
All 4-bit modes support grouped quantization, where a small group of weights (e.g. 128) in the channel dimension shares quantization parameters (scale).
All embeddings, convolutions and last linear layers are always compressed to the 8-bit integer data type. To quantize embeddings and last linear layers to 4-bit, use `all_layers=True`.
The percentage of the remaining layers compressed to 4-bit can be configured by the "ratio" parameter. E.g. ratio=0.9 means 90% of layers are compressed to the corresponding 4-bit data type and the rest to the 8-bit asymmetric integer data type.
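Not part of this commit: a minimal usage sketch of the options described above, written against the public `nncf.compress_weights` API. Parameter names (`mode`, `ratio`, `group_size`, `all_layers`) follow NNCF's documented interface; `model` is a hypothetical placeholder for an already-loaded OpenVINO model.

```python
import nncf
from nncf import CompressWeightsMode

# `model` is assumed to be an openvino.Model loaded elsewhere.
compressed_model = nncf.compress_weights(
    model,
    mode=CompressWeightsMode.INT4_SYM,  # 4-bit symmetric primary precision, no zero point
    ratio=0.9,                          # ~90% of eligible layers in INT4, the rest in INT8_ASYM
    group_size=128,                     # 128 weights along the channel dimension share one scale
)

# Compress embeddings and the last linear layer to 4-bit as well:
compressed_model = nncf.compress_weights(
    model, mode=CompressWeightsMode.INT4_SYM, all_layers=True
)
```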
4 changes: 2 additions & 2 deletions nncf/parameters.py
@@ -68,13 +68,13 @@ class CompressWeightsMode(StrEnum):
"""
Defines a mode for weight compression.
:param INT8_SYM: Stands for 8-bit integer symmetric quantization of all weights.
- Weights are quantized symmetrically with a fixed zero point equals to 128.
+ Weights are quantized symmetrically without zero point.
https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization
:param INT8_ASYM: The same as INT8_SYM mode, but weights are quantized to a primary precision asymmetrically
with a typical non-fixed zero point.
https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization
:param INT4_SYM: Stands for a mixed-precision weights quantization with 4-bit integer as a primary precision.
- Weights are quantized to a primary precision symmetrically with a fixed zero point equals to 8.
+ Weights are quantized to a primary precision symmetrically without zero point.
All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM,
by default. All others are quantized either to 4-bit integer or to a backup precision depending on
criteria and the given ratio.
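As an editorial illustration of the corrected wording (not NNCF's actual implementation), a minimal NumPy sketch of the distinction: symmetric quantization uses a scale only and no zero point, while asymmetric quantization derives a non-fixed zero point from the data.

```python
import numpy as np

def quantize_int8_sym(w: np.ndarray):
    """Symmetric: scale only, no zero point; signed range [-128, 127]."""
    scale = np.abs(w).max() / 127.0
    q = np.clip(np.round(w / scale), -128, 127).astype(np.int8)
    return q, scale  # dequantize as q * scale

def quantize_int8_asym(w: np.ndarray):
    """Asymmetric: scale plus a data-dependent (non-fixed) zero point; unsigned range [0, 255]."""
    lo, hi = float(w.min()), float(w.max())
    scale = (hi - lo) / 255.0
    zero_point = round(-lo / scale)
    q = np.clip(np.round(w / scale) + zero_point, 0, 255).astype(np.uint8)
    return q, scale, zero_point  # dequantize as (q - zero_point) * scale

w = np.random.randn(16).astype(np.float32)
q_sym, _ = quantize_int8_sym(w)
q_asym, _, _ = quantize_int8_asym(w)
```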
4 changes: 2 additions & 2 deletions nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -70,11 +70,11 @@ def __init__(
"""
:param mode: Defines a mode for weight compression.
INT8_SYM stands for 8-bit integer symmetric quantization of all weights.
- Weights are quantized symmetrically with a fixed zero point equals to 128.
+ Weights are quantized symmetrically without zero point.
INT8_ASYM is the same as INT8_SYM mode, but weights are quantized to a primary precision asymmetrically
with a typical non-fixed zero point.
INT4_SYM stands for a mixed-precision weights quantization with 4-bit integer as a primary precision.
- Weights are quantized to a primary precision symmetrically with a fixed zero point equals to 8.
+ Weights are quantized to a primary precision symmetrically without zero point.
All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM,
by default. All others are quantized either to 4-bit integer or to a backup precision depending on
criteria and the given ratio.
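For context, the group-wise scale sharing used by the 4-bit modes (a small group of weights sharing one scale, as described in the documentation above) can be sketched as follows. This is a simplified illustration with an assumed [out_channels, in_channels] weight layout, not the NNCF code path.

```python
import numpy as np

def quantize_int4_sym_grouped(w: np.ndarray, group_size: int = 128):
    """Simplified group-wise symmetric 4-bit quantization of a [out_ch, in_ch] weight matrix."""
    out_ch, in_ch = w.shape
    assert in_ch % group_size == 0, "sketch assumes the channel dimension divides evenly into groups"
    groups = w.reshape(out_ch, in_ch // group_size, group_size)
    scales = np.abs(groups).max(axis=-1, keepdims=True) / 7.0  # signed 4-bit range is [-8, 7]
    q = np.clip(np.round(groups / scales), -8, 7).astype(np.int8)
    return q, scales  # dequantize as (q * scales).reshape(out_ch, in_ch)

q, scales = quantize_int4_sym_grouped(np.random.randn(8, 256).astype(np.float32), group_size=128)
```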
6 changes: 3 additions & 3 deletions nncf/quantization/quantize_model.py
@@ -348,11 +348,11 @@ def compress_weights(
:param model: A model to be compressed.
:type model: TModel
:param mode: Defines a mode for weight compression.
- INT8_SYM stands for 8-bit integer symmetric quantization of all weights.
+ INT8_SYM stands for 8-bit integer symmetric quantization of all weights without zero point.
INT8_ASYM is the same as INT8_SYM mode, but weights are quantized to a primary precision asymmetrically
with a typical non-fixed zero point.
INT4_SYM stands for a mixed-precision weights quantization with 4-bit integer as a primary precision.
- Weights are quantized to a primary precision symmetrically with a fixed zero point equals to 8.
+ Weights are quantized to a primary precision symmetrically without zero point.
All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM,
by default. All others are quantized either to 4-bit integer or to a backup precision depending on
criteria and the given ratio.
@@ -393,7 +393,7 @@ def compress_weights(
"""
    if mode == CompressWeightsMode.INT8:
        warning_deprecated(
-            "`CompressWeightsMode.INT8` is deprecated." "Please, use `CompressWeightsMode.INT8_ASYM` as value instead."
+            "`CompressWeightsMode.INT8` is deprecated. Please, use `CompressWeightsMode.INT8_ASYM` as value instead."
        )
        mode = CompressWeightsMode.INT8_ASYM
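From the caller's side, the effect of this branch can be illustrated as follows (sketch only; `model` is a hypothetical placeholder for any model supported by `compress_weights`).

```python
import nncf

# Passing the deprecated alias emits the warning above and then behaves like INT8_ASYM.
compressed = nncf.compress_weights(model, mode=nncf.CompressWeightsMode.INT8)
# Equivalent call without the deprecation warning:
compressed = nncf.compress_weights(model, mode=nncf.CompressWeightsMode.INT8_ASYM)
```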

11 changes: 8 additions & 3 deletions tests/torch/ptq/test_weights_compression.py
@@ -17,6 +17,7 @@
from nncf import SensitivityMetric
from nncf.quantization import compress_weights
from nncf.torch import wrap_model
+from nncf.torch.quantization.layers import AsymmetricWeightsDecompressor
from nncf.torch.quantization.layers import SymmetricWeightsDecompressor

DATA_BASED_SENSITIVITY_METRICS = (
@@ -127,16 +128,20 @@ def test_compress_weights(mode):
    assert n_compressed_weights == n_target_modules


-def test_compress_weights_functional_model():
+@pytest.mark.parametrize("mode", (CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM))
+def test_compress_weights_functional_model(mode):
    model = FunctionalModel()
+    decompressor_type = (
+        SymmetricWeightsDecompressor if mode == CompressWeightsMode.INT8_SYM else AsymmetricWeightsDecompressor
+    )

    input_ids = torch.randint(0, 10, [1, 3, 300, 300])
    wrapped_model = wrap_model(model, example_input=input_ids, trace_parameters=True)
-    compressed_model = compress_weights(wrapped_model, mode=CompressWeightsMode.INT8_SYM)
+    compressed_model = compress_weights(wrapped_model, mode=mode)

    n_compressed_weights = 0
    for layer in compressed_model.nncf.external_op.values():
-        if isinstance(layer, SymmetricWeightsDecompressor):
+        if isinstance(layer, decompressor_type):
            n_compressed_weights += 1
    assert n_compressed_weights == 4

