From 85b32635533ad776439c90a81ee59ae4db3b5862 Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: Thu, 13 Jun 2024 09:00:53 +0100 Subject: [PATCH] Represent symmetrically quantized weights in signed data type (#2434) ### Changes Represent symmetrically quantized weights in signed data type with no zero point ### Reason for changes * To detect the quantization type without analyzing zero-point values * Signed data type for symmetrically quantized weights will lead to a smaller footprint, especially in case of grouped quantization. ### Related tickets 130625 ### Tests Updated: `tests/torch/ptq/test_weights_compression.py` and `tests/openvino/native/quantization/test_weights_compression.py` Merge after: https://github.com/openvinotoolkit/openvino/pull/24457 Model | Backend | Metric name | Metric value | Metric diff | Num int4 | Num int8 | RAM MiB | Compr. time | Total time -- | -- | -- | -- | -- | -- | -- | -- | -- | -- tinyllama_data_aware_awq_scale_estimation | OV | Similarity | 0.84048 | -0.15952 | 94 | 124 | 35560 | 0:06:31 | 0:08:36 tinyllama_data_aware_awq_scale_estimation_stateful | OV | Similarity | 0.84048 | -0.15952 | 94 | 124 | 36612 | 0:06:12 | 0:07:40 tinyllama_data_aware_awq_stateful | OV | Similarity | 0.85259 | -0.14741 | 94 | 124 | 34824 | 0:01:50 | 0:03:17 tinyllama_data_aware | OV | Similarity | 0.83853 | -0.16147 | 94 | 124 | 30604 | 0:01:25 | 0:03:30 tinyllama_data_aware_gptq | OV | Similarity | 0.82187 | -0.17813 | 94 | 124 | 39624 | 0:25:09 | 0:27:10 tinyllama_data_free | OV | Similarity | 0.72057 | -0.27943 | 114 | 84 | 6671 | 0:00:42 | 0:02:46 tinyllama_int8_data_free | TORCH | Similarity | 0.95624 | -0.04376 | 0 | 312 | 30161 | 0:00:09 | 0:02:54 --- .../weights_compression/Usage.md | 4 +- nncf/parameters.py | 4 +- .../weight_compression/algorithm.py | 4 +- .../algorithms/weight_compression/gptq.py | 14 +- .../weight_compression/mixed_precision.py | 3 +- .../weight_compression/openvino_backend.py | 60 ++--- .../weight_compression/scale_estimation.py | 95 +++++--- .../weight_compression/torch_backend.py | 22 +- .../weight_compression/weight_lowering.py | 69 +++--- nncf/quantization/quantize_model.py | 6 +- nncf/torch/quantization/layers.py | 29 ++- nncf/torch/quantization/quantize_functions.py | 18 +- ...egerModel_compressed_weights_int8_sym.json | 221 +++++++++--------- .../quantization/test_weights_compression.py | 89 ++++--- .../post_training/data/wc_reference_data.yaml | 12 +- tests/torch/ptq/test_weights_compression.py | 33 ++- 16 files changed, 378 insertions(+), 305 deletions(-) diff --git a/docs/usage/post_training_compression/weights_compression/Usage.md b/docs/usage/post_training_compression/weights_compression/Usage.md index a9e5289e0db..8d2f56143f5 100644 --- a/docs/usage/post_training_compression/weights_compression/Usage.md +++ b/docs/usage/post_training_compression/weights_compression/Usage.md @@ -9,7 +9,7 @@ The Weights Compression algorithm is aimed at compressing the weights of the mod #### Supported modes By default, weights are compressed asymmetrically to 8-bit integer data type - "INT8_ASYM" mode. -OpenVINO backend also supports 3 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM and NF4. The primary precision in case of INT4_SYM mode is unsigned 4-bit integer and weights are quantized to it [symmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) with a fixed zero point equals to 8. In case of INT4_ASYM mode - also unsigned 4-bit integer, but weight are quantized to it [asymmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization) with a typical non-fixed zero point. In case of NF4 mode - [nf4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point. +OpenVINO backend also supports 3 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM and NF4. The primary precision in case of INT4_SYM mode is signed 4-bit integer and weights are quantized to it [symmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization) without zero point. In case of INT4_ASYM mode - unsigned 4-bit integer and weight are quantized to it [asymmetrically](/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#asymmetric-quantization) with a typical non-fixed zero point. In case of NF4 mode - [nf4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without zero point. All 4-bit modes have a grouped quantization support, when small group of weights (e.g. 128) in the channel dimension share quantization parameters (scale). All embeddings, convolutions and last linear layers are always compressed to 8-bit integer data type. To quantize embeddings and last linear layers to 4-bit, use `all_layers=True`. Percent of the rest layers compressed to 4-bit can be configured by "ratio" parameter. E.g. ratio=0.9 means 90% of layers compressed to the corresponding 4-bit data type and the rest to 8-bit asymmetric integer data type. @@ -484,7 +484,7 @@ Here is the perplexity and accuracy with data-free and data-aware mixed-precisio - The algorithm is supported for OpenVINO and PyTorch models. - The compression applies in-place. - The compressed model is not trainable. -- INT8_SYM, INT4_SYM, INT4_ASYM and NF4 modes, grouped quantization and mixed precision selection is available for OpenVINO backend only. +- INT4_SYM, INT4_ASYM and NF4 modes, grouped quantization and mixed precision selection is available for OpenVINO backend only. - NF4 support is experimental - models quantized to nf4 should not be faster models quantized to 8-bit integer. #### Additional resources diff --git a/nncf/parameters.py b/nncf/parameters.py index 3cfeb246668..00cf487b9d4 100644 --- a/nncf/parameters.py +++ b/nncf/parameters.py @@ -68,13 +68,13 @@ class CompressWeightsMode(StrEnum): """ Defines a mode for weight compression. :param INT8_SYM: Stands for 8-bit integer symmetric quantization of all weights. - Weights are quantized symmetrically with a fixed zero point equals to 128. + Weights are quantized symmetrically without zero point. https://github.com/openvinotoolkit/nncf/blob/develop/docs/usage/training_time_compression/other_algorithms/LegacyQuantization.md#symmetric-quantization :param INT8_ASYM: The same as INT8_SYM mode, but weights are quantized to a primary precision asymmetrically with a typical non-fixed zero point. https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization :param INT4_SYM: Stands for a mixed-precision weights quantization with 4-bit integer as a primary precision. - Weights are quantized to a primary precision symmetrically with a fixed zero point equals to 8. + Weights are quantized to a primary precision symmetrically without zero point. All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM, by default. All others are quantized whether to 4-bit integer or to a backup precision depending on criteria and the given ratio. diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py index 4609df0a463..8fb6e2c1b9b 100644 --- a/nncf/quantization/algorithms/weight_compression/algorithm.py +++ b/nncf/quantization/algorithms/weight_compression/algorithm.py @@ -70,11 +70,11 @@ def __init__( """ :param mode: Defines a mode for weight compression. INT8_SYM stands for 8-bit integer symmetric quantization of all weights. - Weights are quantized symmetrically with a fixed zero point equals to 128. + Weights are quantized symmetrically without zero point. INT8_ASYM is the same as INT8_SYM mode, but weights are quantized to a primary precision asymmetrically with a typical non-fixed zero point. INT4_SYM stands for a mixed-precision weights quantization with 4-bit integer as a primary precision. - Weights are quantized to a primary precision symmetrically with a fixed zero point equals to 8. + Weights are quantized to a primary precision symmetrically without zero point. All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM, by default. All others are quantized whether to 4-bit integer or to a backup precision depending on criteria and the given ratio. diff --git a/nncf/quantization/algorithms/weight_compression/gptq.py b/nncf/quantization/algorithms/weight_compression/gptq.py index 282e2f6910a..ee2940a86aa 100644 --- a/nncf/quantization/algorithms/weight_compression/gptq.py +++ b/nncf/quantization/algorithms/weight_compression/gptq.py @@ -263,7 +263,7 @@ def _quantize_weights( quantized_col = decompress_nf4_weight(compressed_weights, scales[-1]) else: compressed_weights = calculate_quantized_weight( - fns.unsqueeze(weight_col, 1), scales[-1], zero_points[-1], block_compression_config + fns.unsqueeze(weight_col, 1), block_compression_config, scales[-1], zero_points[-1] ) quantized_col = do_dequantization(compressed_weights, scales[-1], zero_points[-1]) quantized_col = fns.flatten(quantized_col) @@ -287,13 +287,11 @@ def _quantize_weights( ) scales = fns.stack(scales, axis=1) - if wc_params.compression_config.mode == CompressWeightsMode.NF4: - zero_points = None - elif wc_params.compression_config.mode in [ - CompressWeightsMode.INT8_SYM, - CompressWeightsMode.INT4_SYM, + if wc_params.compression_config.mode in [ + CompressWeightsMode.INT8_ASYM, + CompressWeightsMode.INT4_ASYM, ]: - zero_points = fns.squeeze(zero_points[0]) - else: zero_points = fns.stack(zero_points, axis=1) + else: + zero_points = None return scales, zero_points diff --git a/nncf/quantization/algorithms/weight_compression/mixed_precision.py b/nncf/quantization/algorithms/weight_compression/mixed_precision.py index 29a5fc4bb3a..c4e4c73fa92 100644 --- a/nncf/quantization/algorithms/weight_compression/mixed_precision.py +++ b/nncf/quantization/algorithms/weight_compression/mixed_precision.py @@ -22,6 +22,7 @@ from nncf.quantization.algorithms.weight_compression.backend import WeightCompressionAlgoBackend from nncf.quantization.algorithms.weight_compression.config import WeightCompressionConfig from nncf.quantization.algorithms.weight_compression.config import WeightCompressionParameters +from nncf.quantization.algorithms.weight_compression.weight_lowering import do_dequantization from nncf.quantization.algorithms.weight_compression.weight_lowering import do_integer_quantization from nncf.quantization.algorithms.weight_compression.weight_lowering import get_integer_quantization_error @@ -176,7 +177,7 @@ def _calc_weight_sensitivity(self, weight_param: WeightCompressionParameters) -> weight = weight.astype(TensorDataType.float32) compressed_weights, scale, zero_point = do_integer_quantization(weight, reduction_axes, backup_config) - decompressed_weight = (compressed_weights - zero_point).astype(weight.dtype) * scale + decompressed_weight = do_dequantization(compressed_weights, scale, zero_point) decompressed_weight = decompressed_weight.reshape(orig_shape) return fns.linalg.norm(decompressed_weight - weight, ord="fro").item() diff --git a/nncf/quantization/algorithms/weight_compression/openvino_backend.py b/nncf/quantization/algorithms/weight_compression/openvino_backend.py index c565c8a603d..33ecb91a56f 100644 --- a/nncf/quantization/algorithms/weight_compression/openvino_backend.py +++ b/nncf/quantization/algorithms/weight_compression/openvino_backend.py @@ -20,6 +20,7 @@ from nncf.common.graph.transformations.commands import TargetType from nncf.common.graph.utils import get_reduction_axes from nncf.experimental.common.tensor_statistics.collectors import TensorCollector +from nncf.experimental.tensor.definitions import TensorDataType from nncf.experimental.tensor.tensor import Tensor from nncf.openvino.graph.metatypes import openvino_metatypes as om from nncf.openvino.graph.model_transformer import OVModelTransformer @@ -134,17 +135,14 @@ def transform_model( compression_config = wc_params.compression_config if compression_config.mode == CompressWeightsMode.NF4: compression_dtype = ov.Type.nf4 - elif compression_config.mode in [ - CompressWeightsMode.INT8_ASYM, - CompressWeightsMode.INT8_SYM, - CompressWeightsMode.INT8, - CompressWeightsMode.INT4_ASYM, - CompressWeightsMode.INT4_SYM, - ]: - if compression_config.mode in [CompressWeightsMode.INT4_ASYM, CompressWeightsMode.INT4_SYM]: - compression_dtype = ov.Type.u4 - else: - compression_dtype = ov.Type.u8 + elif compression_config.mode == CompressWeightsMode.INT4_SYM: + compression_dtype = ov.Type.i4 + elif compression_config.mode == CompressWeightsMode.INT4_ASYM: + compression_dtype = ov.Type.u4 + elif compression_config.mode == CompressWeightsMode.INT8_SYM: + compression_dtype = ov.Type.i8 + elif compression_config.mode == CompressWeightsMode.INT8_ASYM: + compression_dtype = ov.Type.u8 else: raise ValueError(f"{compression_config.mode.value} is not supported.") @@ -175,7 +173,7 @@ def transform_model( compressed_weight.tensor.data, dtype=compression_dtype, name=const_node_name ) converted_const = opset.convert(compressed_const, ov.Type.f16) - if compressed_weight.zero_point is not None: + if compressed_weight.zero_point is not None and compressed_weight.tensor.dtype == TensorDataType.uint8: zero_point_const = opset.constant( compressed_weight.zero_point.data, dtype=compression_dtype, @@ -220,27 +218,28 @@ def dump_parameters( @staticmethod def get_compress_decompress_pipeline( - weight_compression_parameter: WeightCompressionParameters, w_shape, s_shape, z_p_shape + weight_compression_parameter: WeightCompressionParameters, w_shape, s_shape, z_p_shape=None ): - ( - w, - s, - zp, - clamp, - ) = OVWeightCompressionAlgoBackend.get_compress_pipeline( + parameters, clamp = OVWeightCompressionAlgoBackend.get_compress_pipeline( weight_compression_parameter, w_shape, s_shape, z_p_shape, True ) - result = (clamp - zp) * s - model = ov.Model([result], [w, s, zp]) + if len(parameters) == 3: + _, s, zp = parameters + result = (clamp - zp) * s + else: + s = parameters[1] + result = clamp * s + + model = ov.Model([result], parameters) compiled_model = ov.compile_model(model) - return lambda w, s, zp: compiled_model([w, s, zp])[0] + return lambda parameters: compiled_model(parameters)[0] @staticmethod def get_compress_pipeline( - weight_compression_parameter: WeightCompressionParameters, w_shape, s_shape, z_p_shape, return_nodes=False + weight_compression_parameter: WeightCompressionParameters, w_shape, s_shape, z_p_shape=None, return_nodes=False ): config = weight_compression_parameter.compression_config mode = config.mode @@ -252,18 +251,23 @@ def get_compress_pipeline( w = opset.parameter(w_shape, name="w") s = opset.parameter(s_shape, name="s") - zp = opset.parameter(z_p_shape, name="zp") + parameters = [w, s] + compressed_w = w / s + if z_p_shape is not None: + zp = opset.parameter(z_p_shape, name="zp") + parameters.append(zp) + compressed_w += zp - result = opset.clamp(opset.round(w / s + zp), level_low, level_high, name="compressed_weights") + result = opset.clamp(opset.round(compressed_w), level_low, level_high, name="compressed_weights") if return_nodes: - return w, s, zp, result + return parameters, result - model = ov.Model([result], [w, s, zp]) + model = ov.Model([result], parameters) compiled_model = ov.compile_model(model) - return lambda w, s, zp: compiled_model([w, s, zp])[0] + return lambda parameters: compiled_model(parameters)[0] class OVAWQAlgoAlgoBackend(OVWeightCompressionAlgoBackend): diff --git a/nncf/quantization/algorithms/weight_compression/scale_estimation.py b/nncf/quantization/algorithms/weight_compression/scale_estimation.py index 3cc9d7f4180..156335e43b5 100644 --- a/nncf/quantization/algorithms/weight_compression/scale_estimation.py +++ b/nncf/quantization/algorithms/weight_compression/scale_estimation.py @@ -10,7 +10,7 @@ # limitations under the License. from copy import deepcopy -from typing import Any, Dict, List, Optional, TypeVar +from typing import Any, Dict, List, Optional, Tuple, TypeVar from nncf import Dataset from nncf.common.graph.graph import NNCFGraph @@ -117,7 +117,7 @@ def apply( :return: Dict with pairs (weight name, estimated scale). """ - compress_decompress_cashe = {} + compress_decompress_cache = {} res = dict() for wp in track(self._all_weight_params, description="Applying Scale Estimation"): @@ -165,8 +165,8 @@ def apply( original_weight = fns.zeros_like(weight) + weight compressed_weights, scale, zp = do_integer_quantization(original_weight, reduction_axis, config) - zp = zp.astype(scale.dtype) - + if zp is not None: + zp = zp.astype(scale.dtype) q_weights = do_dequantization(compressed_weights, scale, zp, reduction_axis) s = fns.unsqueeze(s, 0) @@ -180,9 +180,7 @@ def apply( importance = fns.ones_like(original_weight) importance = importance * s - target = compressed_weights.astype(dtype=zp.dtype) - zp - zero_mask = compressed_weights == zp - + target, zero_mask = get_target_zero_mask(compressed_weights, zp) importance = fns.where(zero_mask, 0.0, importance) # normalize importances for every group of weights to make sum of them equal to 1.0 @@ -203,18 +201,20 @@ def apply( if self._weight_penalty > 0.0: min_max_scale_diffs += self._weight_penalty * fns.mean((q_weights - original_weight) ** 2, axis=-1) - key = ( - (wp.compression_config.mode, wp.compression_config.num_bits) + q_weights.shape + scale.shape + zp.shape - ) - if key in compress_decompress_cashe: - compress_decompress_model = compress_decompress_cashe[key]["compress_decompress_model"] - compress_model = compress_decompress_cashe[key]["compress_model"] + zp_shape = zp.shape if zp is not None else None + key = [(wp.compression_config.mode, wp.compression_config.num_bits) + q_weights.shape + scale.shape] + if zp is not None: + key += zp_shape + key = tuple(key) + if key in compress_decompress_cache: + compress_decompress_model = compress_decompress_cache[key]["compress_decompress_model"] + compress_model = compress_decompress_cache[key]["compress_model"] else: compress_decompress_model = self._backend_entity.get_compress_decompress_pipeline( - wp, q_weights.shape, scale.shape, zp.shape + wp, q_weights.shape, scale.shape, zp_shape ) - compress_model = self._backend_entity.get_compress_pipeline(wp, q_weights.shape, scale.shape, zp.shape) - compress_decompress_cashe[key] = { + compress_model = self._backend_entity.get_compress_pipeline(wp, q_weights.shape, scale.shape, zp_shape) + compress_decompress_cache[key] = { "compress_decompress_model": compress_decompress_model, "compress_model": compress_model, } @@ -222,14 +222,15 @@ def apply( zero_scale = 0.001 zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + input_tensors = [original_weight.data, None] + if zp is not None: + input_tensors.append(zp.data) # iterative rectification of initial scale for i in range(self._initial_steps): - ideal_scale = fns.abs(original_weight) / (fns.abs(target) + zero_mask) - weighted_scale = ideal_scale * importance - - near_to_ideal_scale = fns.sum(weighted_scale, axis=2, keepdims=True) + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) + input_tensors[1] = near_to_ideal_scale.data - out = compress_decompress_model(original_weight.data, near_to_ideal_scale.data, zp.data) + out = compress_decompress_model(input_tensors) q_weights_ = fns.zeros_like(original_weight) + out q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) @@ -252,12 +253,12 @@ def apply( else: near_to_ideal_scale = mask * result_scale + (1.0 - mask) * near_to_ideal_scale result_scale = near_to_ideal_scale + input_tensors[1] = near_to_ideal_scale.data if i < self._initial_steps - 1: - out = compress_model(original_weight.data, near_to_ideal_scale.data, zp.data) + out = compress_model(input_tensors) compressed_weights = fns.zeros_like(original_weight) + out - target = compressed_weights - zp - zero_mask = compressed_weights == zp + target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) # iterative rectification of scale based on grid search @@ -265,18 +266,16 @@ def apply( factor = 1.0 - 0.05 * scale_steps scaled_scale = factor * scale - out = compress_model(original_weight.data, scaled_scale.data, zp.data) + input_tensors[1] = scaled_scale.data + out = compress_model(input_tensors) compressed_weights = fns.zeros_like(original_weight) + out - target = compressed_weights - zp - zero_mask = compressed_weights == zp + target, zero_mask = get_target_zero_mask(compressed_weights, zp) zero_mask = zero_scale * zero_mask.astype(original_weight.dtype) + near_to_ideal_scale = estimate_scales(original_weight, target, zero_mask, importance) - ideal_scale = fns.abs(original_weight) / (fns.abs(target) + zero_mask) - weighted_scale = ideal_scale * importance - near_to_ideal_scale = fns.sum(weighted_scale, axis=2, keepdims=True) - - out = compress_decompress_model(original_weight.data, near_to_ideal_scale.data, zp.data) + input_tensors[1] = near_to_ideal_scale.data + out = compress_decompress_model(input_tensors) q_weights_ = fns.zeros_like(original_weight) + out q_outs = fns.matmul(fns.transpose(q_weights_, (1, 0, 2)), X) @@ -300,3 +299,35 @@ def apply( res[weight_name] = result_scale return res + + +def get_target_zero_mask(compressed_weights: TTensor, zp: Optional[TTensor] = None) -> Tuple[TTensor, TTensor]: + """ + Computes the target values and a mask indicating zero values in the target. + + :param compressed_weights: The compressed weights tensor. + :param zp: The zero point tensor. + :return: The compressed weights optionally adjusted by the zero point and + a boolean mask indicating positions in the target that are close to zero. + """ + target = compressed_weights + if zp is not None: + target = target.astype(dtype=zp.dtype) - zp + zero_mask = fns.isclose(target, 0) + return target, zero_mask + + +def estimate_scales(weight: TTensor, target: TTensor, zero_mask: TTensor, importance: TTensor) -> TTensor: + """ + Estimates scales for the given weight, target, zero mask, and importance. + + :param weight: The weights tensor. + :param target: The target values tensor. + :param zero_mask: A boolean mask indicating positions in the target that are close to zero. + :param importance: The importance values tensor. + :return: The estimated scales + """ + ideal_scale = fns.abs(weight) / (fns.abs(target) + zero_mask) + weighted_scale = ideal_scale * importance + near_to_ideal_scale = fns.sum(weighted_scale, axis=2, keepdims=True) + return near_to_ideal_scale diff --git a/nncf/quantization/algorithms/weight_compression/torch_backend.py b/nncf/quantization/algorithms/weight_compression/torch_backend.py index 4aa139177df..6e1b4a7a6c1 100644 --- a/nncf/quantization/algorithms/weight_compression/torch_backend.py +++ b/nncf/quantization/algorithms/weight_compression/torch_backend.py @@ -38,7 +38,8 @@ from nncf.torch.model_graph_manager import split_const_name from nncf.torch.model_transformer import PTModelTransformer from nncf.torch.nncf_network import NNCFNetwork -from nncf.torch.quantization.layers import WeightsDecompressor +from nncf.torch.quantization.layers import AsymmetricWeightsDecompressor +from nncf.torch.quantization.layers import SymmetricWeightsDecompressor from nncf.torch.tensor_statistics.collectors import get_raw_stat_collector @@ -211,7 +212,11 @@ def transform_model( compressed_weight.scale = compressed_weight.scale.astype(dtype=TensorDataType.float16) # pack compressed tensor - packed_tensor = compressed_weight.tensor.astype(TensorDataType.uint8) + if compression_config.mode == CompressWeightsMode.INT8_SYM: + dtype = TensorDataType.int8 + else: + dtype = TensorDataType.uint8 + packed_tensor = compressed_weight.tensor.astype(dtype) # sets compressed tensor compressed_parameter = torch.nn.Parameter(packed_tensor.data, requires_grad=False) @@ -225,13 +230,14 @@ def transform_model( if id(param) == id(weight): setattr(c_module, name, compressed_parameter) - # pack zero point tensor - packed_zero_point = compressed_weight.zero_point.astype(TensorDataType.uint8) - # creates weight decompressor - decompressor = WeightsDecompressor( - compressed_weight.scale.data, packed_zero_point.data, result_dtype=weight.dtype - ) + if compression_config.mode == CompressWeightsMode.INT8_SYM: + decompressor = SymmetricWeightsDecompressor(compressed_weight.scale.data, result_dtype=weight.dtype) + else: + packed_zero_point = compressed_weight.zero_point.astype(dtype) + decompressor = AsymmetricWeightsDecompressor( + compressed_weight.scale.data, packed_zero_point.data, result_dtype=weight.dtype + ) # registry weight decompression module in the model decompressor_name = f"weights_decompressor_{weight_node.node_name.replace('.', '_')}" diff --git a/nncf/quantization/algorithms/weight_compression/weight_lowering.py b/nncf/quantization/algorithms/weight_compression/weight_lowering.py index c3b7ed40c84..54a2ef90754 100644 --- a/nncf/quantization/algorithms/weight_compression/weight_lowering.py +++ b/nncf/quantization/algorithms/weight_compression/weight_lowering.py @@ -222,43 +222,39 @@ def calculate_integer_quantization_params( assert mode != CompressWeightsMode.NF4, "The function supports integer quantization only" num_bits = config.num_bits - level_low = 0 - level_high = 2**num_bits - 1 - if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) if mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM]: + level_low = 0 + level_high = 2**num_bits - 1 min_values = fns.min(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] max_values = fns.max(weight, axis=reduction_axes, keepdims=True) # [a1, r, a2] -> [a1, 1, a2] scale, zero_point = calculate_scale_zero_point( min_values, max_values, level_low, level_high, narrow_range=False ) - else: - level_low_sym = -(2 ** (num_bits - 1)) - level_high_sym = 2 ** (num_bits - 1) - 1 - - scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) # [a1, r//gs, 1, a2] - scale = scale / level_high_sym - zero_point = fns.as_tensor_like(scale, [-level_low_sym]).astype(TensorDataType.int32) - eps = fns.finfo(scale).eps - # NOTE: adding machine epsilon to avoid division by zero - scale = fns.where(fns.abs(scale) < eps, eps, scale) + return scale, zero_point - return scale, zero_point + level_high = 2 ** (num_bits - 1) - 1 + scale = fns.max(fns.abs(weight), axis=reduction_axes, keepdims=True) # [a1, r//gs, 1, a2] + scale /= level_high + eps = fns.finfo(scale).eps + # NOTE: adding machine epsilon to avoid division by zero + scale = fns.where(fns.abs(scale) < eps, eps, scale) + return scale, None def calculate_quantized_weight( - weight: Tensor, scale: Tensor, zero_point: Tensor, config: WeightCompressionConfig + weight: Tensor, config: WeightCompressionConfig, scale: Tensor, zero_point: Optional[Tensor] = None ) -> Tensor: """ Quantizes the weight tensor using the provided scale and zero point. :param weight: Weight tensor to quantize. + :param config: Weight compression configuration. :param scale: Scale tensor used for quantization. :param zero_point: Zero point tensor used for quantization. - :param config: Weight compression configuration. - :return: Quantized weight tensor of uint8 type. + :return: Quantized weight tensor of uint8 or int8 type. """ if weight.dtype != TensorDataType.float32: weight = weight.astype(TensorDataType.float32) @@ -266,12 +262,16 @@ def calculate_quantized_weight( scale = scale.astype(TensorDataType.float32) num_bits = config.num_bits - - level_low = 0 - level_high = 2**num_bits - 1 - - compressed_weights = fns.round(weight / scale + zero_point.astype(weight.dtype)) - compressed_weights = fns.clip(compressed_weights, level_low, level_high).astype(TensorDataType.uint8) + asym_quant = config.mode in [CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT4_ASYM] + dtype = TensorDataType.uint8 if asym_quant else TensorDataType.int8 + level_low = 0 if asym_quant else -(2 ** (num_bits - 1)) + level_high = 2**num_bits - 1 if asym_quant else 2 ** (num_bits - 1) - 1 + + compressed_weights = weight / scale + if zero_point is not None: + compressed_weights += zero_point.astype(weight.dtype) + compressed_weights = fns.round(compressed_weights) + compressed_weights = fns.clip(compressed_weights, level_low, level_high).astype(dtype) return compressed_weights @@ -285,14 +285,14 @@ def do_integer_quantization( """ The method quantizes the given weights to integer data type in accordance with the compression config. The config defines a quantization mode: - INT8_SYM mode refers to unsigned int8 symmetric weight compression with a fixed zero point equals to 128 - - quantization to [0, 255] range. + INT8_SYM mode refers to signed int8 symmetric weight compression without zero point - + quantization to [-128, 127] range. INT8_ASYM mode refers to unsigned int8 asymmetric weight compression with a typical non-fixed zero-point - quantization to [0, 255] range. INT4_ASYM mode refers to unsigned int4 asymmetric weight compression with a typical non-fixed zero-point - quantization to [0, 15] range. - INT4_SYM mode refers to unsigned int4 symmetric weight compression with a fixed zero point equals to 8 - - quantization to [0, 15] range. + INT4_SYM mode refers to signed int4 symmetric weight compression without zero point - + quantization to [-8, 7] range. NF4 mode requires a dedicated procedure and it is not supported in this method. One of the parameter of compression config is a group size. Quantization is per-channel, if group size equals to -1, otherwise it's per-group, i.e. group size number of weights in the channel dimension share quantization parameters @@ -303,8 +303,8 @@ def do_integer_quantization( :param config: Information on how to compress (quantize) a specific weight. :param precomputed_scale: Precomputed scale. :param precomputed_zero_point: Precomputed zero point. - :return: The compressed weights tensor of uint8 type, scale tensor of float32 type and - zero point tensor of int32 type that was used for its quantization. + :return: The compressed weights tensor of uint8 (asymmetric mode) or int8 (symmetric mode) type, + scale tensor of float32 type and zero point tensor of int32 type that was used for its quantization. """ mode = config.mode assert mode != CompressWeightsMode.NF4, "The function supports integer quantization only" @@ -324,7 +324,7 @@ def do_integer_quantization( if precomputed_zero_point is not None: zero_point = precomputed_zero_point - compressed_weights = calculate_quantized_weight(weight, scale, zero_point, config) + compressed_weights = calculate_quantized_weight(weight, config, scale, zero_point) return compressed_weights, scale, zero_point @@ -346,8 +346,7 @@ def get_integer_quantization_error( weight = weight.astype(TensorDataType.float32) compressed_weights, scale, zero_point = do_integer_quantization(weight, reduction_axes, config) - - decompressed_weight = (compressed_weights - zero_point).astype(weight.dtype) * scale + decompressed_weight = do_dequantization(compressed_weights, scale, zero_point) decompressed_weight = decompressed_weight.reshape(orig_shape) diff = (decompressed_weight - weight) ** 2 @@ -386,7 +385,7 @@ def compress_weight( def do_dequantization( - compressed_weights: Tensor, scale: Tensor, zero_point: Tensor, reduction_axis: int = -1 + compressed_weights: Tensor, scale: Tensor, zero_point: Optional[Tensor] = None, reduction_axis: int = -1 ) -> Tensor: """ The method dequantizes the given weights to float point data type in accordance with the scale and @@ -399,7 +398,9 @@ def do_dequantization( :return: dequantized/decompressed weights. """ decompressed_weight = compressed_weights.astype(dtype=scale.dtype) - decompressed_weight = (decompressed_weight - zero_point) * scale + if zero_point is not None: + decompressed_weight -= zero_point + decompressed_weight *= scale if reduction_axis > -1: shape = list(decompressed_weight.shape) # [a1, r, a2] - "r" refers to number of channels along reduction axis diff --git a/nncf/quantization/quantize_model.py b/nncf/quantization/quantize_model.py index fc6f5c07cde..46426d77c21 100644 --- a/nncf/quantization/quantize_model.py +++ b/nncf/quantization/quantize_model.py @@ -348,11 +348,11 @@ def compress_weights( :param model: A model to be compressed. :type model: TModel :param mode: Defines a mode for weight compression. - INT8_SYM stands for 8-bit integer symmetric quantization of all weights. + INT8_SYM stands for 8-bit integer symmetric quantization of all weights without zero point. INT8_ASYM is the same as INT8_SYM mode, but weights are quantized to a primary precision asymmetrically with a typical non-fixed zero point. INT4_SYM stands for a mixed-precision weights quantization with 4-bit integer as a primary precision. - Weights are quantized to a primary precision symmetrically with a fixed zero point equals to 8. + Weights are quantized to a primary precision symmetrically without zero point. All embeddings and the last layer are always compressed to a backup precision, which is INT8_ASYM, by default. All others are quantized whether to 4-bit integer or to a backup precision depending on criteria and the given ratio. @@ -393,7 +393,7 @@ def compress_weights( """ if mode == CompressWeightsMode.INT8: warning_deprecated( - "`CompressWeightsMode.INT8` is deprecated." "Please, use `CompressWeightsMode.INT8_ASYM` as value instead." + "`CompressWeightsMode.INT8` is deprecated. Please, use `CompressWeightsMode.INT8_ASYM` as value instead." ) mode = CompressWeightsMode.INT8_ASYM diff --git a/nncf/torch/quantization/layers.py b/nncf/torch/quantization/layers.py index 4b463600bc5..b84d325526f 100644 --- a/nncf/torch/quantization/layers.py +++ b/nncf/torch/quantization/layers.py @@ -46,7 +46,8 @@ from nncf.torch.quantization.quantize_functions import ExportQuantizeToONNXQuantDequant from nncf.torch.quantization.quantize_functions import TuneRange from nncf.torch.quantization.quantize_functions import asymmetric_quantize -from nncf.torch.quantization.quantize_functions import decompress +from nncf.torch.quantization.quantize_functions import decompress_asymmetric +from nncf.torch.quantization.quantize_functions import decompress_symmetric from nncf.torch.quantization.quantize_functions import get_scale_zp_from_input_low_input_high from nncf.torch.quantization.quantize_functions import symmetric_quantize from nncf.torch.return_types import maybe_get_values_from_torch_return_type @@ -1044,9 +1045,9 @@ def get_scale_shape(input_shape: List[int], is_weights: bool, per_channel: bool, return get_per_channel_scale_shape(input_shape, is_weights, channel_idx) -class WeightsDecompressor(nn.Module): +class AsymmetricWeightsDecompressor(nn.Module): """ - Applies decompression of compressed weights in the forward pass + Applies asymmetric decompression of compressed weights in the forward pass """ def __init__(self, scale: torch.Tensor, zero_point: torch.Tensor, result_dtype: torch.dtype = None): @@ -1061,6 +1062,26 @@ def __init__(self, scale: torch.Tensor, zero_point: torch.Tensor, result_dtype: self.result_dtype = result_dtype def forward(self, x): - result = decompress(x, self._scale, self._zero_point) + result = decompress_asymmetric(x, self._scale, self._zero_point) + result = result.type(dtype=self.result_dtype) if self.result_dtype is not None else result + return result + + +class SymmetricWeightsDecompressor(nn.Module): + """ + Applies symmetric decompression of compressed weights in the forward pass + """ + + def __init__(self, scale: torch.Tensor, result_dtype: torch.dtype = None): + """ + :param scale: A scale in quantization scheme + :param result_dtype: (Optional) A data type that result should be cast to + """ + super().__init__() + self.register_buffer("_scale", scale) + self.result_dtype = result_dtype + + def forward(self, x): + result = decompress_symmetric(x, self._scale) result = result.type(dtype=self.result_dtype) if self.result_dtype is not None else result return result diff --git a/nncf/torch/quantization/quantize_functions.py b/nncf/torch/quantization/quantize_functions.py index 6771d6f38fa..9b4055c4586 100644 --- a/nncf/torch/quantization/quantize_functions.py +++ b/nncf/torch/quantization/quantize_functions.py @@ -249,9 +249,9 @@ def backward(ctx: Any, *grad_outputs: Any) -> Any: @register_operator() -def decompress(input: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor) -> torch.Tensor: +def decompress_asymmetric(input: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tensor) -> torch.Tensor: """ - Decompress the input tensor. + Decompress the asymmetrically quantized input tensor. :param input: An input tensor :param scale: A scale tensor @@ -261,3 +261,17 @@ def decompress(input: torch.Tensor, scale: torch.Tensor, zero_point: torch.Tenso input = input.type(dtype=scale.dtype) decompressed_input = (input - zero_point) * scale return decompressed_input + + +@register_operator() +def decompress_symmetric(input: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: + """ + Decompress the symmetrically quantized input tensor. + + :param input: An input tensor + :param scale: A scale tensor + :return: The decompressed tensor + """ + input = input.type(dtype=scale.dtype) + decompressed_input = input * scale + return decompressed_input diff --git a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int8_sym.json b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int8_sym.json index 2e58bf43fc2..777d418696d 100644 --- a/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int8_sym.json +++ b/tests/openvino/native/data/2024.1/reference_scales/IntegerModel_compressed_weights_int8_sym.json @@ -2,63 +2,60 @@ "matmul_2_data": { "compressed_weight": [ [ - 182, - 152, - 200, - 255, - 165, - 136, - 193 - ], - [ - 155, - 140, - 206, - 168, - 219, - 155, - 255 - ], - [ - 177, - 142, - 212, - 251, - 187, - 255, - 195 - ], - [ - 182, - 207, - 255, - 249, - 187, - 225, - 191 - ], - [ - 200, - 235, - 184, - 228, - 225, - 255, - 144 - ], - [ - 222, - 248, - 253, - 130, - 240, - 255, - 252 + 54, + 24, + 72, + 127, + 37, + 8, + 65 + ], + [ + 27, + 12, + 78, + 40, + 91, + 27, + 127 + ], + [ + 49, + 14, + 84, + 123, + 59, + 127, + 67 + ], + [ + 54, + 79, + 127, + 121, + 59, + 97, + 63 + ], + [ + 72, + 107, + 56, + 100, + 97, + 127, + 16 + ], + [ + 94, + 120, + 125, + 2, + 112, + 127, + 124 ] ], - "zero_point": [ - 128 - ], "scale": [ [ 0.0062713623046875 @@ -83,57 +80,54 @@ "matmul_1_data": { "compressed_weight": [ [ - 185, - 208, - 133, - 152, - 255, - 251 + 57, + 80, + 5, + 24, + 127, + 123 ], [ - 206, - 177, - 255, - 253, - 215, - 211 + 78, + 49, + 127, + 125, + 87, + 83 ], [ - 249, - 196, - 152, - 255, - 220, - 183 + 121, + 68, + 24, + 127, + 92, + 55 ], [ - 194, - 249, - 255, - 177, - 206, - 172 + 66, + 121, + 127, + 49, + 78, + 44 ], [ - 213, - 176, - 184, - 255, - 160, - 217 + 85, + 48, + 56, + 127, + 32, + 89 ], [ - 140, - 249, - 242, - 163, - 255, - 136 + 12, + 121, + 114, + 35, + 127, + 8 ] ], - "zero_point": [ - 128 - ], "scale": [ [ 0.005279541015625 @@ -158,33 +152,30 @@ "gather_2_data": { "compressed_weight": [ [ - 217, - 166, - 134, - 130, - 241, - 255 + 89, + 38, + 6, + 2, + 113, + 127 ], [ - 210, - 227, - 202, - 255, - 239, - 128 + 82, + 99, + 74, + 127, + 111, + 0 ], [ - 254, - 133, - 235, - 154, - 255, - 208 + 126, + 5, + 107, + 26, + 127, + 80 ] ], - "zero_point": [ - 128 - ], "scale": [ [ 0.0071868896484375 diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py index dbc82013df5..3ca449bba9b 100644 --- a/tests/openvino/native/quantization/test_weights_compression.py +++ b/tests/openvino/native/quantization/test_weights_compression.py @@ -74,41 +74,41 @@ def get_next_node(node): def check_int8_node(op: ov.Node, mode: CompressWeightsMode = CompressWeightsMode.INT8_ASYM): - assert op.get_element_type() == ov.Type(np.uint8) + dtype = ov.Type.u8 if mode == CompressWeightsMode.INT8_ASYM else ov.Type.i8 + assert op.get_element_type() == dtype compressed_weight = get_const_value(op) + stats = {"compressed_weight": compressed_weight} convert_node = get_next_node(op) assert convert_node.get_type_name() == "Convert" - sub_node = get_next_node(convert_node) - assert sub_node.get_type_name() == "Subtract" + if mode == CompressWeightsMode.INT8_ASYM: + sub_node = get_next_node(convert_node) + assert sub_node.get_type_name() == "Subtract" - convert_node = sub_node.input_value(1).get_node() - assert convert_node.get_type_name() == "Convert" + convert_node = sub_node.input_value(1).get_node() + assert convert_node.get_type_name() == "Convert" - zero_point_node = convert_node.input_value(0).get_node() - zero_point = get_const_value(zero_point_node) - if mode == CompressWeightsMode.INT8_SYM: - assert list(zero_point_node.shape) == [1] - else: + zero_point_node = convert_node.input_value(0).get_node() + zero_point = get_const_value(zero_point_node) + stats["zero_point"] = zero_point reduced_weight_shape = list(op.shape) reduced_weight_shape[-1] = 1 assert list(zero_point_node.shape) == reduced_weight_shape + mul_node = get_next_node(sub_node) + else: + mul_node = get_next_node(convert_node) - mul_node = get_next_node(sub_node) assert mul_node.get_type_name() == "Multiply" scale_node = mul_node.input_value(1).get_node() scale = get_const_value(scale_node) - - return { - "compressed_weight": compressed_weight, - "zero_point": zero_point, - "scale": scale, - } + stats["scale"] = scale + return stats def check_int4_grouped(op: ov.Node, mode: CompressWeightsMode, group_size: int = 7): - assert op.get_element_type() == ov.Type.u4 + dtype = ov.Type.u4 if mode == CompressWeightsMode.INT4_ASYM else ov.Type.i4 + assert op.get_element_type() == dtype weight_shape = op.shape # NOTE: get_const_value doesn't work for 4-bit types assert list(weight_shape)[-1] == group_size @@ -118,20 +118,20 @@ def check_int4_grouped(op: ov.Node, mode: CompressWeightsMode, group_size: int = convert_node = get_next_node(op) assert convert_node.get_type_name() == "Convert" - sub_node = get_next_node(convert_node) - assert sub_node.get_type_name() == "Subtract" + if mode == CompressWeightsMode.INT4_ASYM: + sub_node = get_next_node(convert_node) + assert sub_node.get_type_name() == "Subtract" - convert_node = sub_node.input_value(1).get_node() - assert convert_node.get_type_name() == "Convert" + convert_node = sub_node.input_value(1).get_node() + assert convert_node.get_type_name() == "Convert" - zero_point_node = convert_node.input_value(0).get_node() - assert zero_point_node.get_element_type() == ov.Type.u4 - if mode == CompressWeightsMode.INT4_SYM: - assert list(zero_point_node.shape) == [1] - else: + zero_point_node = convert_node.input_value(0).get_node() + assert zero_point_node.get_element_type() == dtype assert list(zero_point_node.shape) == reduced_weight_shape + mul_node = get_next_node(sub_node) + else: + mul_node = get_next_node(convert_node) - mul_node = get_next_node(sub_node) assert mul_node.get_type_name() == "Multiply" scale_node = mul_node.input_value(1).get_node() assert list(scale_node.shape) == reduced_weight_shape @@ -277,12 +277,12 @@ def test_gather_in_4_bit_if_all_layers_with_data(metric): sensitivity_metric=metric, dataset=dataset, ) - int4_reference_node_names = ["gather_2_data", "gather_2_data/zero_point"] + int4_reference_node_names = ["gather_2_data"] nodes_map = {op.get_friendly_name(): op for op in compressed_model.get_ordered_ops()} for node_name in int4_reference_node_names: node = nodes_map[node_name] assert node.get_type_name() == "Constant" - assert node.get_element_type() == ov.Type.u4 + assert node.get_element_type() == ov.Type.i4 def test_gather_can_be_8_bit_if_all_layers_without_data(): @@ -306,17 +306,13 @@ def test_gather_can_be_8_bit_if_all_layers_without_data(): def test_conv_in_8_bit_if_mode_8bit(mode): model = WeightsModel().ov_model compressed_model = compress_weights(model, mode=mode) - int8_reference_node_names = [ - "conv_weights_0", - "conv_weights_0/zero_point", - "conv_weights_1", - "conv_weights_1/zero_point", - ] + int8_reference_node_names = ["conv_weights_0", "conv_weights_1"] nodes_map = {op.get_friendly_name(): op for op in compressed_model.get_ordered_ops()} + dtype = ov.Type.u8 if mode == CompressWeightsMode.INT8_ASYM else ov.Type.i8 for node_name in int8_reference_node_names: node = nodes_map[node_name] assert node.get_type_name() == "Constant" - assert node.get_element_type() == ov.Type.u8 + assert node.get_element_type() == dtype @pytest.mark.parametrize("all_layers", (True, False)) @@ -339,9 +335,9 @@ def test_conv_in_8_bit_if_mode_4bit(all_layers): ]: assert ov.Type.u8 == op.get_element_type() elif op.get_friendly_name() in ["weights_1", "weights_1/zero_point"]: - assert ov.Type.u4 == op.get_element_type() + assert ov.Type.i4 == op.get_element_type() elif op.get_friendly_name() in ["weights_0", "weights_0/zero_point"]: - dtype = ov.Type.u4 if all_layers else ov.Type.u8 + dtype = ov.Type.i4 if all_layers else ov.Type.u8 assert dtype == op.get_element_type() @@ -354,12 +350,12 @@ def test_gather_can_be_4_bit_if_all_layers_without_data(): group_size=1, all_layers=True, ) - int4_reference_node_names = ["gather_2_data", "gather_2_data/zero_point"] + int4_reference_node_names = ["gather_2_data"] nodes_map = {op.get_friendly_name(): op for op in compressed_model.get_ordered_ops()} for node_name in int4_reference_node_names: node = nodes_map[node_name] assert node.get_type_name() == "Constant" - assert node.get_element_type() == ov.Type.u4 + assert node.get_element_type() == ov.Type.i4 @pytest.mark.parametrize("metric", ALL_SENSITIVITY_METRICS) @@ -427,9 +423,10 @@ def test_data_based_criterion(mode, ref_scores, ref_act_scores, mocker): def test_quantize_Gather_with_multiple_reduction_axes_in_8bit(mode): model = GatherWithTwoReductionAxes().ov_model compressed_model = compress_weights(model, mode=mode) + dtype = ov.Type.u8 if mode == CompressWeightsMode.INT8_ASYM else ov.Type.i8 for op in compressed_model.get_ordered_ops(): if op.get_type_name() == "Constant" and op.get_friendly_name() == "gather_1_data": - assert op.get_element_type() == ov.Type.u8 + assert op.get_element_type() == dtype @pytest.mark.parametrize("mode", (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM)) @@ -445,9 +442,9 @@ def test_quantize_Gather_with_multiple_reduction_axes_if_mode_4bit(mode, all_lay @pytest.mark.parametrize("mode", (CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM)) def test_shared_gather(mode): weight_name_vs_type = { - "gather_2_data": ov.Type(np.uint8), - "shared_data": ov.Type(np.uint8), - "matmul_1_data": ov.Type.u4, + "gather_2_data": ov.Type.u8, + "shared_data": ov.Type.u8, + "matmul_1_data": ov.Type.i4 if mode == CompressWeightsMode.INT4_SYM else ov.Type.u4, } model = GatherAndMatmulShareData().ov_model compressed_model = compress_weights(model, mode, group_size=3) @@ -638,7 +635,7 @@ def test_weight_compress_with_ignored_scope(ignored_scope, num_compressed): if ( op.get_type_name() == "Constant" and op.get_friendly_name() in ref_compressed_weights - and op.get_element_type() == ov.Type(np.uint8) + and op.get_element_type() == ov.Type.u8 ): act_num += 1 assert act_num == num_compressed diff --git a/tests/post_training/data/wc_reference_data.yaml b/tests/post_training/data/wc_reference_data.yaml index ae59fdfb3fb..e9675646e74 100644 --- a/tests/post_training/data/wc_reference_data.yaml +++ b/tests/post_training/data/wc_reference_data.yaml @@ -1,22 +1,22 @@ tinyllama_data_free_backend_OV: metric_value: 0.72057 - num_int4: 228 + num_int4: 114 num_int8: 84 tinyllama_data_aware_backend_OV: metric_value: 0.83853 - num_int4: 188 + num_int4: 94 num_int8: 124 tinyllama_data_aware_awq_stateful_backend_OV: metric_value: 0.85259 - num_int4: 188 + num_int4: 94 num_int8: 124 tinyllama_data_aware_awq_scale_estimation_backend_OV: metric_value: 0.8404 - num_int4: 188 + num_int4: 94 num_int8: 124 tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV: metric_value: 0.8404 - num_int4: 188 + num_int4: 94 num_int8: 124 tinyllama_int8_data_free_backend_TORCH: metric_value: 0.95624 @@ -24,5 +24,5 @@ tinyllama_int8_data_free_backend_TORCH: num_int8: 312 tinyllama_data_aware_gptq_backend_OV: metric_value: 0.83387 - num_int4: 188 + num_int4: 94 num_int8: 124 diff --git a/tests/torch/ptq/test_weights_compression.py b/tests/torch/ptq/test_weights_compression.py index b7de194e6c7..30c704e5435 100644 --- a/tests/torch/ptq/test_weights_compression.py +++ b/tests/torch/ptq/test_weights_compression.py @@ -17,7 +17,8 @@ from nncf import SensitivityMetric from nncf.quantization import compress_weights from nncf.torch import wrap_model -from nncf.torch.quantization.layers import WeightsDecompressor +from nncf.torch.quantization.layers import AsymmetricWeightsDecompressor +from nncf.torch.quantization.layers import SymmetricWeightsDecompressor DATA_BASED_SENSITIVITY_METRICS = ( SensitivityMetric.HESSIAN_INPUT_ACTIVATION, @@ -28,7 +29,7 @@ ALL_SENSITIVITY_METRICS = DATA_BASED_SENSITIVITY_METRICS + (SensitivityMetric.WEIGHT_QUANTIZATION_ERROR,) -SUPPORTED_MODES = (CompressWeightsMode.INT8, CompressWeightsMode.INT8_ASYM) +SUPPORTED_MODES = (CompressWeightsMode.INT8, CompressWeightsMode.INT8_ASYM, CompressWeightsMode.INT8_SYM) UNSUPPORTED_MODES = ( CompressWeightsMode.INT4_SYM, CompressWeightsMode.INT4_ASYM, @@ -106,12 +107,14 @@ def forward(self, input_): return x -def test_compress_weights(): +@pytest.mark.parametrize("mode", (CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM)) +def test_compress_weights(mode): model = ShortTransformer(5, 10) + dtype = torch.int8 if mode == CompressWeightsMode.INT8_SYM else torch.uint8 input_ids = torch.randint(0, 10, (5,)) wrapped_model = wrap_model(model, example_input=input_ids, trace_parameters=True) - compressed_model = compress_weights(wrapped_model) + compressed_model = compress_weights(wrapped_model, mode=mode) n_compressed_weights = 0 n_target_modules = 0 @@ -119,22 +122,26 @@ def test_compress_weights(): for _, module in compressed_model.named_children(): if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)): n_target_modules += 1 - if module.weight.dtype in [torch.uint8, torch.int8]: + if module.weight.dtype == dtype: n_compressed_weights += 1 assert n_compressed_weights == n_target_modules -def test_compress_weights_functional_model(): +@pytest.mark.parametrize("mode", (CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM)) +def test_compress_weights_functional_model(mode): model = FunctionalModel() + decompressor_type = ( + SymmetricWeightsDecompressor if mode == CompressWeightsMode.INT8_SYM else AsymmetricWeightsDecompressor + ) input_ids = torch.randint(0, 10, [1, 3, 300, 300]) wrapped_model = wrap_model(model, example_input=input_ids, trace_parameters=True) - compressed_model = compress_weights(wrapped_model) + compressed_model = compress_weights(wrapped_model, mode=mode) n_compressed_weights = 0 for layer in compressed_model.nncf.external_op.values(): - if isinstance(layer, WeightsDecompressor): + if isinstance(layer, decompressor_type): n_compressed_weights += 1 assert n_compressed_weights == 4 @@ -158,12 +165,14 @@ def test_compress_weights_conv(): assert n_compressed_weights == n_target_modules -def test_compress_shared_weights(mocker): +@pytest.mark.parametrize("mode", (CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM)) +def test_compress_shared_weights(mocker, mode): model = ShortTransformer(5, 10, share_weights=True) + dtype = torch.int8 if mode == CompressWeightsMode.INT8_SYM else torch.uint8 input_ids = torch.randint(0, 10, (5,)) wrapped_model = wrap_model(model, example_input=input_ids, trace_parameters=True) - compressed_model = compress_weights(wrapped_model) + compressed_model = compress_weights(wrapped_model, mode=mode) n_compressed_weights = 0 n_target_modules = 0 @@ -171,7 +180,7 @@ def test_compress_shared_weights(mocker): for _, module in compressed_model.named_children(): if isinstance(module, (torch.nn.Linear, torch.nn.Embedding)): n_target_modules += 1 - if module.weight.dtype in [torch.uint8, torch.int8]: + if module.weight.dtype == dtype: n_compressed_weights += 1 assert n_compressed_weights == n_target_modules @@ -215,7 +224,7 @@ def test_raise_error_with_unsupported_params_for_int8(mode, params): @pytest.mark.parametrize("mode", UNSUPPORTED_MODES) -def test_raise_error_with_not_int8_asym(mode): +def test_raise_error_with_not_int8(mode): dummy_torch_model = EmptyModel() dummy_input = torch.Tensor() wrapped_model = wrap_model(dummy_torch_model, example_input=dummy_input, trace_parameters=True)