Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NNCF]: Add INT8 weight compression conformance test for Tinyllama-1.1b PyTorch model #2636

Merged
merged 41 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
2d6fbe5
feat: Added to the test scope
AdiKsOnDev Apr 9, 2024
52e8180
feat: Added torch backend support
AdiKsOnDev Apr 12, 2024
fd48363
Merge branch 'openvinotoolkit:develop' into develop
AdiKsOnDev Apr 16, 2024
c024803
fix: Moved int8 conversion in _validate()
AdiKsOnDev Apr 18, 2024
6405c4e
git: Merge branch 'develop' of github.com:AdiKsOnDev/nncf into develop
AdiKsOnDev Apr 18, 2024
f48c148
fix: Returned initial implementation of _validate()
AdiKsOnDev Apr 22, 2024
f9505e4
chore: Temporary dummy data
AdiKsOnDev Apr 22, 2024
2bc73ec
fix: Model Preparation for TORCH backend
AdiKsOnDev Apr 22, 2024
927c38f
fix: Removed unsupported parameters for INT8
AdiKsOnDev Apr 22, 2024
f008103
chore: Comment on important addition
AdiKsOnDev Apr 22, 2024
eeade47
feat: Added correct metric value according to @aleksu52
AdiKsOnDev Apr 22, 2024
fc05eed
fix: Mode accurate check for the INT8 compression mode
AdiKsOnDev Apr 23, 2024
4aefa0d
feat: Problematic code for @aleksu52 to reproduce
AdiKsOnDev Apr 23, 2024
737c1a7
feat: Use AutoModelForCausalLM for TORCH models
AdiKsOnDev Apr 24, 2024
8066b76
fix: Added model specific parameters during preparation
AdiKsOnDev Apr 24, 2024
512aa63
Merge branch 'openvinotoolkit:develop' into develop
AdiKsOnDev Apr 24, 2024
0041998
refactor: Make a tokenizer during model preparation
AdiKsOnDev Apr 24, 2024
3a61ccf
feat: Tokenize an input string (Temporary) to feed in torch model
AdiKsOnDev Apr 24, 2024
ea0c4c4
fix: Added torch_dtype parameter to the model
AdiKsOnDev Apr 24, 2024
c346100
chore: Removed unnecessary compression parameters
AdiKsOnDev Apr 24, 2024
1cfccf9
refactor: Line spacing, preprocessor usage
AdiKsOnDev Apr 25, 2024
88dc901
Merge branch 'openvinotoolkit:develop' into develop
AdiKsOnDev Apr 26, 2024
5deba30
fix: Removing convert_model()
AdiKsOnDev Apr 27, 2024
40c5686
fix: The pipeline now runs for TORCH models
AdiKsOnDev Apr 27, 2024
d3989be
fix: Using model_hf for validation
AdiKsOnDev Apr 28, 2024
43aec31
fix: Changed the reference metric value
AdiKsOnDev Apr 28, 2024
a85ded2
refactor: Pre-Commit changes
AdiKsOnDev Apr 28, 2024
28af569
fix: Returned the original checks for int4/int8 values
AdiKsOnDev Apr 30, 2024
c3d5e2d
Merge branch 'openvinotoolkit:develop' into develop
AdiKsOnDev Apr 30, 2024
a72ae7e
chore: Pre-Commit changes
AdiKsOnDev Apr 30, 2024
2f6f69c
git: Merge main branch
AdiKsOnDev Apr 30, 2024
d90b356
Merge branch 'develop' into develop
AdiKsOnDev Apr 30, 2024
7d328c3
refactor: Pre-Commit Changes
AdiKsOnDev Apr 30, 2024
7e50cfa
fix: Removed the debugging line
AdiKsOnDev May 1, 2024
7c31d3d
fix: Corrected reference data for TORCH backend
AdiKsOnDev May 2, 2024
6899097
refactor: Code made cleaner
AdiKsOnDev May 2, 2024
86e91f9
fix: Utilized wikitext for TORCH models as well
AdiKsOnDev May 2, 2024
7f32430
feat: Implemented get_num_compressed
AdiKsOnDev May 2, 2024
7729867
fix: Dumping the fp32 model correctly
AdiKsOnDev May 2, 2024
70cd912
chore: Removed unnecessary model wrapping
AdiKsOnDev May 2, 2024
e5db8cc
fix: Changed _validate to match the modified pipeline
AdiKsOnDev May 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions tests/post_training/data/wc_reference_data.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,7 @@ tinyllama_data_aware_awq_scale_estimation_stateful_backend_OV:
metric_value: 0.83795
num_int4: 188
num_int8: 124
tinyllama_int8_data_free_backend_TORCH:
metric_value: 0.95944
num_int4: 228
AdiKsOnDev marked this conversation as resolved.
Show resolved Hide resolved
num_int8: 84
AdiKsOnDev marked this conversation as resolved.
Show resolved Hide resolved
9 changes: 9 additions & 0 deletions tests/post_training/model_scope.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,15 @@
"params": {"is_stateful": True},
"backends": [BackendType.OV],
},
{
"reported_name": "tinyllama_int8_data_free",
"model_id": "tinyllama/tinyllama-1.1b-step-50k-105b",
"pipeline_cls": LMWeightCompression,
"compression_params": {
"mode": CompressWeightsMode.INT8_ASYM,
},
"backends": [BackendType.TORCH],
},
]


Expand Down
80 changes: 51 additions & 29 deletions tests/post_training/pipelines/lm_weight_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from datasets import load_dataset
from memory_profiler import memory_usage
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from whowhatbench import Evaluator

Expand Down Expand Up @@ -68,23 +69,35 @@ def get_stats(self) -> Dict[str, str]:
class LMWeightCompression(BaseTestPipeline):
"""Pipeline for causal language models from Hugging Face repository"""

OV_MODEL_NAME = "openvino_model.xml"
MODEL_NAME = "openvino_model.xml"
MODEL_FUNC = OVModelForCausalLM

def prepare_model(self) -> None:
is_stateful = self.params.get("is_stateful", False)

if self.backend == BackendType.TORCH:
self.MODEL_NAME = "torch_model.xml"
self.MODEL_FUNC = AutoModelForCausalLM
AdiKsOnDev marked this conversation as resolved.
Show resolved Hide resolved
self.MODEL_SPECIFIC_PARAMS = {}
else:
self.MODEL_SPECIFIC_PARAMS = {"export": True, "compile": False, "stateful": is_stateful}

if is_stateful:
self.fp32_model_dir = self.fp32_model_dir.parent / (self.fp32_model_dir.name + "_sf")
if not (self.fp32_model_dir / self.OV_MODEL_NAME).exists():

if not (self.fp32_model_dir / self.MODEL_NAME).exists():
# export by model_id
self.model_hf = OVModelForCausalLM.from_pretrained(
self.model_id, export=True, load_in_8bit=False, compile=False, stateful=is_stateful
self.model_hf = self.MODEL_FUNC.from_pretrained(
self.model_id, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS
)

self._dump_model_fp32()
else:
# no export, load from IR. Applicable for sequential run of test cases in local environment.
self.model_hf = OVModelForCausalLM.from_pretrained(
self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful
self.model_hf = self.MODEL_FUNC.from_pretrained(
self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS
)

self.model = self.model_hf.model

def prepare_preprocessor(self) -> None:
Expand Down Expand Up @@ -138,6 +151,14 @@ def transform_fn(data, max_tokens=128):
def prepare_calibration_dataset(self):
dataset = load_dataset("wikitext", "wikitext-2-v1", split="train", revision="b08601e")
dataset = dataset.filter(lambda example: len(example["text"]) > 128)
if self.backend == BackendType.TORCH:
AdiKsOnDev marked this conversation as resolved.
Show resolved Hide resolved
example_text = "The TinyLlama project aims to pretrain a 1.1B Llama model on 3 trillion tokens."
token = self.preprocessor(example_text, max_length=500, return_tensors="pt", truncation=True)
inputs = {"input_ids": token["input_ids"], "attention_mask": token["attention_mask"]}

self.calibration_dataset = nncf.Dataset([inputs])

return
self.calibration_dataset = nncf.Dataset(dataset, self.get_transform_calibration_fn())

def cleanup_cache(self):
Expand All @@ -164,25 +185,16 @@ def collect_data_from_stdout(self, stdout: str):
def save_compressed_model(self) -> None:
if self.backend == BackendType.FP32:
return
ov.serialize(self.model, self.output_model_dir / self.OV_MODEL_NAME)
self.model_hf._save_config(self.output_model_dir)
if self.backend == BackendType.TORCH:
self.model_hf.save_pretrained(self.output_model_dir)
AdiKsOnDev marked this conversation as resolved.
Show resolved Hide resolved

def get_num_compressed(self) -> None:
AdiKsOnDev marked this conversation as resolved.
Show resolved Hide resolved
"""
Get number of the i8, u8, i4, u4 ops in the compressed IR.
"""
num_int8 = 0
num_int4 = 0
return

for node in self.model.get_ops():
for i in range(node.get_output_size()):
if node.get_output_element_type(i).get_type_name() in ["i8", "u8"]:
num_int8 += 1
if node.get_output_element_type(i).get_type_name() in ["i4", "u4"]:
num_int4 += 1
ov.serialize(self.model, self.output_model_dir / self.MODEL_NAME)
self.model_hf._save_config(self.output_model_dir)

self.run_info.num_compress_nodes.num_int8 = num_int8
self.run_info.num_compress_nodes.num_int4 = num_int4
def get_num_compressed(self) -> None:
pass

def run_bench(self) -> None:
pass
Expand All @@ -193,20 +205,29 @@ def _dump_model_fp32(self) -> None:
to the dedicated shared folder.
"""
self.model_hf.save_pretrained(self.fp32_model_dir)
AdiKsOnDev marked this conversation as resolved.
Show resolved Hide resolved
self.model_hf._save_config(self.fp32_model_dir)
if self.backend != BackendType.TORCH:
self.model_hf._save_config(self.fp32_model_dir)

def _compress(self):
"""
Actual call of weight compression
"""
if self.backend == BackendType.TORCH:
from nncf.torch.model_creation import is_wrapped_model
from nncf.torch.model_creation import wrap_model

if not is_wrapped_model(self.model):
example_input = next(iter(self.calibration_dataset.get_inference_data()))
self.model = wrap_model(self.model, example_input=example_input, trace_parameters=True)
self.calibration_dataset = None

AdiKsOnDev marked this conversation as resolved.
Show resolved Hide resolved
self.compressed_model = nncf.compress_weights(
self.model,
dataset=self.calibration_dataset,
**self.compression_params,
)

def _validate(self):
is_stateful = self.params.get("is_stateful", False)
AdiKsOnDev marked this conversation as resolved.
Show resolved Hide resolved
core = ov.Core()

if os.environ.get("INFERENCE_NUM_THREADS"):
Expand All @@ -218,8 +239,8 @@ def _validate(self):
gt_data_path.parent.mkdir(parents=True, exist_ok=True)
if os.getenv("NNCF_TEST_REGEN_DOT") is not None:
print("Collection ground-truth reference data")
model_gold = OVModelForCausalLM.from_pretrained(
self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful
model_gold = self.MODEL_FUNC.from_pretrained(
self.fp32_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS
)
evaluator = Evaluator(base_model=model_gold, tokenizer=self.preprocessor, metrics=("similarity",))
evaluator.dump_gt(str(gt_data_path))
Expand All @@ -231,10 +252,11 @@ def _validate(self):
)

compressed_model_hf = self.model_hf
if self.backend != BackendType.FP32:
compressed_model_hf = OVModelForCausalLM.from_pretrained(
self.output_model_dir, trust_remote_code=True, load_in_8bit=False, compile=False, stateful=is_stateful
if self.backend != BackendType.FP32 and self.backend != BackendType.TORCH:
compressed_model_hf = self.MODEL_FUNC.from_pretrained(
self.output_model_dir, trust_remote_code=True, load_in_8bit=False, **self.MODEL_SPECIFIC_PARAMS
)

print("Evaluation of the target model")
_, all_metrics = evaluator.score(compressed_model_hf)
similarity = all_metrics["similarity"][0]
Expand Down
Loading