diff --git a/nncf/quantization/algorithms/weight_compression/algorithm.py b/nncf/quantization/algorithms/weight_compression/algorithm.py
index 6d354303aa1..d6dc7e6923b 100644
--- a/nncf/quantization/algorithms/weight_compression/algorithm.py
+++ b/nncf/quantization/algorithms/weight_compression/algorithm.py
@@ -165,9 +165,6 @@ def _get_ratio_defining_params(
         if self._mode in [CompressWeightsMode.INT8_SYM, CompressWeightsMode.INT8_ASYM]:
             return all_weight_params
 
-        if self._all_layers:
-            return list(filter(lambda wp: len(wp.reduction_axes) == 1, all_weight_params))
-
         ratio_defining_params = list(
             filter(
                 lambda wp: wp.node_with_weight.metatype in self._backend_entity.matmul_metatypes,
@@ -175,18 +172,21 @@ def _get_ratio_defining_params(
             )
         )
 
+        # The last MatMul layer is quantized to 4-bits if all_layers=True or if the layer is shared
+        if not self._all_layers and not is_last_layer_shared:
+            ratio_defining_params = ratio_defining_params[:-1]
+
+        # Embedding layers are quantized to 4-bits only if all_layers=True.
         if self._all_layers:
             embedding_params = list(
                 filter(
-                    lambda wp: wp.node_with_weight.metatype in self._backend_entity.embedding_metatypes,
+                    lambda wp: wp.node_with_weight.metatype in self._backend_entity.embedding_metatypes
+                    and len(wp.reduction_axes) == 1,
                     all_weight_params,
                 )
             )
             ratio_defining_params.extend(embedding_params)
 
-        if not self._all_layers and not is_last_layer_shared:
-            ratio_defining_params = ratio_defining_params[:-1]
-
         return ratio_defining_params
 
     def _set_weight_compression_config(
diff --git a/tests/openvino/native/quantization/test_weights_compression.py b/tests/openvino/native/quantization/test_weights_compression.py
index f339d316d0d..78e63b374ab 100644
--- a/tests/openvino/native/quantization/test_weights_compression.py
+++ b/tests/openvino/native/quantization/test_weights_compression.py
@@ -455,6 +455,21 @@ def test_shared_gather(mode):
             assert op.get_element_type() == weight_name_vs_type[op_name]
 
 
+@pytest.mark.parametrize("all_layers", (True, False))
+def test_shared_gather_all_layers(all_layers):
+    weight_name_vs_type = {
+        "gather_2_data": ov.Type.u4 if all_layers else ov.Type.u8,
+        "shared_data": ov.Type.u4 if all_layers else ov.Type.u8,
+        "matmul_1_data": ov.Type.u4,
+    }
+    model = GatherAndMatmulShareData().ov_model
+    compressed_model = compress_weights(model, CompressWeightsMode.INT4_ASYM, group_size=-1, all_layers=all_layers)
+    for op in compressed_model.get_ordered_ops():
+        op_name = op.get_friendly_name()
+        if op.get_type_name() == "Constant" and op_name in weight_name_vs_type:
+            assert op.get_element_type() == weight_name_vs_type[op_name]
+
+
 @dataclass
 class QuantErrorDesc:
     weight: List[float]
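
For context, a minimal sketch of reaching the changed selection logic through the public API, mirroring the call made by the new test. The IR path is a placeholder; `compress_weights`, `CompressWeightsMode`, `group_size`, and `all_layers` come from the diff above.

```python
# Sketch: drive the patched ratio-defining-params selection from the public
# API, as test_shared_gather_all_layers does. "model.xml" is a placeholder.
import openvino as ov

import nncf
from nncf import CompressWeightsMode

model = ov.Core().read_model("model.xml")  # placeholder IR path

# Per the patch: with all_layers=True, embedding (Gather) weights with a
# single reduction axis become 4-bit candidates; with all_layers=False they
# stay 8-bit and the last MatMul is excluded unless its weight is shared.
compressed = nncf.compress_weights(
    model,
    mode=CompressWeightsMode.INT4_ASYM,
    group_size=-1,
    all_layers=True,
)
```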