From cc2f0ee5d3bac0195e07dec058382b6f8dd4c67d Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Mon, 25 Nov 2024 12:18:04 -0800 Subject: [PATCH 01/17] SQFT Finetuning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: J. Pablo Muñoz Signed-off-by: J. Pablo Muñoz --- comps/finetuning/README.md | 43 +- comps/finetuning/finetune_config.py | 29 +- comps/finetuning/finetuning_service.py | 13 +- comps/finetuning/handlers.py | 100 ++- .../llm_on_ray/finetune/finetune.py | 63 +- .../utils/create_sqft_nncf_config.py | 142 ++++ comps/finetuning/utils/extract_sub_adapter.py | 97 +++ comps/finetuning/utils/merge_adapter.py | 16 + comps/finetuning_sqft/Dockerfile | 50 ++ comps/finetuning_sqft/README.md | 240 +++++++ .../example_nncf_config/nncf_config.json | 630 ++++++++++++++++++ comps/finetuning_sqft/finetune_runner.py | 38 ++ comps/finetuning_sqft/finetune_sqft_config.py | 215 ++++++ .../finetuning_sqft_service.py | 76 +++ comps/finetuning_sqft/handlers.py | 338 ++++++++++ comps/finetuning_sqft/launch.sh | 12 + .../llm_on_ray/common/__init__.py | 6 + .../llm_on_ray/common/common.py | 29 + .../llm_on_ray/common/torch_config.py | 72 ++ .../llm_on_ray/finetune/__init__.py | 4 + .../llm_on_ray/finetune/data_process.py | 352 ++++++++++ .../llm_on_ray/finetune/finetune.py | 602 +++++++++++++++++ .../llm_on_ray/finetune/modeling.py | 211 ++++++ .../patches/nncf-v2.12.0.patch | 72 ++ .../patches/peft-v0.10.0.patch | 220 ++++++ .../patches/transformers-v4.44.2.patch | 171 +++++ comps/finetuning_sqft/requirements.txt | 17 + .../utils/extract_sub_adapter.py | 101 +++ comps/finetuning_sqft/utils/merge.py | 27 + .../utils/nncf_config_process.py | 156 +++++ 30 files changed, 4116 insertions(+), 26 deletions(-) create mode 100644 comps/finetuning/utils/create_sqft_nncf_config.py create mode 100644 comps/finetuning/utils/extract_sub_adapter.py create mode 100644 comps/finetuning/utils/merge_adapter.py create mode 100644 comps/finetuning_sqft/Dockerfile create mode 100644 comps/finetuning_sqft/README.md create mode 100644 comps/finetuning_sqft/example_nncf_config/nncf_config.json create mode 100644 comps/finetuning_sqft/finetune_runner.py create mode 100644 comps/finetuning_sqft/finetune_sqft_config.py create mode 100644 comps/finetuning_sqft/finetuning_sqft_service.py create mode 100644 comps/finetuning_sqft/handlers.py create mode 100644 comps/finetuning_sqft/launch.sh create mode 100644 comps/finetuning_sqft/llm_on_ray/common/__init__.py create mode 100644 comps/finetuning_sqft/llm_on_ray/common/common.py create mode 100644 comps/finetuning_sqft/llm_on_ray/common/torch_config.py create mode 100644 comps/finetuning_sqft/llm_on_ray/finetune/__init__.py create mode 100644 comps/finetuning_sqft/llm_on_ray/finetune/data_process.py create mode 100644 comps/finetuning_sqft/llm_on_ray/finetune/finetune.py create mode 100644 comps/finetuning_sqft/llm_on_ray/finetune/modeling.py create mode 100644 comps/finetuning_sqft/patches/nncf-v2.12.0.patch create mode 100644 comps/finetuning_sqft/patches/peft-v0.10.0.patch create mode 100644 comps/finetuning_sqft/patches/transformers-v4.44.2.patch create mode 100644 comps/finetuning_sqft/requirements.txt create mode 100644 comps/finetuning_sqft/utils/extract_sub_adapter.py create mode 100644 comps/finetuning_sqft/utils/merge.py create mode 100644 comps/finetuning_sqft/utils/nncf_config_process.py diff --git a/comps/finetuning/README.md b/comps/finetuning/README.md index 6f554ca221..d2e26582f4 100644 --- a/comps/finetuning/README.md 
+++ b/comps/finetuning/README.md @@ -114,7 +114,42 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \ }' ``` -#### 3.2.2 Reranking Model Training +#### 3.2.2 Instruction Tuning with SQFT's Neural Low-Rank Adapter Search (NLS) + +In addition to traditional fine-tuning, you can use SQFT's NLS to fine-tune your model. +More details about SQFT can be found in [this paper](https://aclanthology.org/2024.findings-emnlp.749.pdf). +Please follow the additional installation requirements [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea#-start-nls-microservice-with-python). +Use the following command to launch a finetuning job with the NLS algorithm: + +```bash +# create a fine-tuning job with NLS +# Max LoRA rank: 16 +# LoRA target modules -> Low-rank search space +# ["q_proj", "k_proj", "v_proj"] -> [16,12,8] +# ["up_proj"] -> [16,12,8] +# ["down_proj"] -> [16,12,8] +curl http://${your_ip}:8015/v1/fine_tuning/jobs \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "training_file": "alpaca_data.json", + "model": "meta-llama/Llama-2-7b-chat-hf", + "General": { + "lora_config": { + "r": 16, + "neural_lora_search": true, + "target_module_groups": [["q_proj", "k_proj", "v_proj"], ["up_proj"], ["down_proj"]], + "search_space": ["16,12,8", "16,12,8", "16,12,8"] + } + } + }' +``` + +Detailed explanations for the parameters can be found [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea#create-nls-fine-tuning-job). +Additional use-cases and benefits of SQFT are available [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea). +Instructions for extracting the desired sub-adapter and merging it with the base model can be found [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea#leverage-fine-tuned-super-adapter).
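+
+The same NLS fine-tuning job can also be created programmatically. The snippet below is a minimal sketch using the `requests` library (it is not part of this microservice); it assumes the service is reachable at `${your_ip}:8015` as in the curl example above and that the response follows the OpenAI-style fine-tuning job schema:
+
+```python
+import requests
+
+YOUR_IP = "localhost"  # assumption: adjust to where the finetuning microservice runs
+
+payload = {
+    "training_file": "alpaca_data.json",
+    "model": "meta-llama/Llama-2-7b-chat-hf",
+    "General": {
+        "lora_config": {
+            "r": 16,
+            "neural_lora_search": True,
+            "target_module_groups": [["q_proj", "k_proj", "v_proj"], ["up_proj"], ["down_proj"]],
+            "search_space": ["16,12,8", "16,12,8", "16,12,8"],
+        }
+    },
+}
+
+# Create the NLS fine-tuning job (same request body as the curl command above).
+resp = requests.post(f"http://{YOUR_IP}:8015/v1/fine_tuning/jobs", json=payload, timeout=30)
+resp.raise_for_status()
+print(resp.json())  # the returned job object should include the fine-tuning job id
+```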
+ +#### 3.2.3 Reranking Model Training Use the following command to launch a finetuning job for reranking model finetuning, such as `BAAI/bge-reranker-large`: @@ -133,7 +168,7 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \ }' ``` -#### 3.2.3 Embedding Model Training +#### 3.2.4 Embedding Model Training Use the following command to launch a finetuning job for embedding model finetuning, such as `BAAI/bge-base-en-v1.5`: @@ -173,7 +208,7 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \ ``` -#### 3.2.4 LLM Pretraining +#### 3.2.5 LLM Pretraining Use the following command to launch a job for LLM pretraining, such as `meta-llama/Llama-2-7b-hf`: @@ -199,7 +234,7 @@ Below is an example for the format of the pretraining dataset: {"text": "A boy with a blue tank top sitting watching three dogs."} ``` -#### 3.2.5 Direct Preference Optimization (DPO) +#### 3.2.6 Direct Preference Optimization (DPO) Use the following command to launch a job for LLM Direct Preference Optimization, such as `meta-llama/Llama-2-7b-hf`: diff --git a/comps/finetuning/finetune_config.py b/comps/finetuning/finetune_config.py index 0b2faf53db..3f297c80f1 100644 --- a/comps/finetuning/finetune_config.py +++ b/comps/finetuning/finetune_config.py @@ -5,9 +5,9 @@ from typing import List, Optional, Union -from pydantic import BaseModel, Field, validator +from pydantic import BaseModel, Field, validator, root_validator -from comps.cores.proto.api_protocol import FineTuningJobsRequest +from comps.cores.proto.api_protocol import FineTuningJobIDRequest, FineTuningJobsRequest PRECISION_BF16 = "bf16" PRECISION_FP16 = "fp16" @@ -37,6 +37,22 @@ class LoraConfig(BaseModel): target_modules: Optional[List[str]] = None +class SQFTNLSConfig(LoraConfig): + neural_lora_search: bool = False + target_module_groups: Optional[List[List[str]]] = None + search_space: Optional[List[str]] = None + + @root_validator(pre=True) + def set_target_modules(cls, values): + target_module_groups = values.get('target_module_groups') + if target_module_groups is not None: + values['target_modules'] = [item for sublist in target_module_groups for item in sublist] + search_space = values.get('search_space') + if search_space is not None: + assert len(search_space) == len(target_module_groups) + return values + + class GeneralConfig(BaseModel): base_model: str = None tokenizer_name: Optional[str] = None @@ -47,7 +63,7 @@ class GeneralConfig(BaseModel): resume_from_checkpoint: Optional[str] = None save_strategy: str = "no" config: LoadConfig = LoadConfig() - lora_config: Optional[LoraConfig] = LoraConfig() + lora_config: Optional[Union[LoraConfig, SQFTNLSConfig]] = LoraConfig() enable_gradient_checkpointing: bool = False task: str = "instruction_tuning" @@ -200,3 +216,10 @@ class FineTuningParams(FineTuningJobsRequest): General: GeneralConfig = GeneralConfig() Dataset: DatasetConfig = DatasetConfig() Training: TrainingConfig = TrainingConfig() + +class ExtractSubAdapterParams(FineTuningJobIDRequest): + adapter_version: str = "heuristic" + custom_config: Optional[List[int]] = None + +class MergeAdapterParams(FineTuningJobIDRequest): + adapter_version: Optional[str] = None diff --git a/comps/finetuning/finetuning_service.py b/comps/finetuning/finetuning_service.py index 64097c720c..4a925ff837 100644 --- a/comps/finetuning/finetuning_service.py +++ b/comps/finetuning/finetuning_service.py @@ -4,12 +4,14 @@ from comps import opea_microservices, register_microservice from comps.cores.proto.api_protocol import FineTuningJobIDRequest, UploadFileRequest -from 
comps.finetuning.finetune_config import FineTuningParams +from comps.finetuning.finetune_config import FineTuningParams, ExtractSubAdapterParams, MergeAdapterParams from comps.finetuning.handlers import ( handle_cancel_finetuning_job, handle_create_finetuning_jobs, + handle_extract_sub_adapter, handle_list_finetuning_checkpoints, handle_list_finetuning_jobs, + handle_merge_adapter, handle_retrieve_finetuning_job, handle_upload_training_files, upload_file, @@ -20,7 +22,6 @@ def create_finetuning_jobs(request: FineTuningParams, background_tasks: BackgroundTasks): return handle_create_finetuning_jobs(request, background_tasks) - @register_microservice( name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"] ) @@ -62,6 +63,14 @@ def list_checkpoints(request: FineTuningJobIDRequest): checkpoints = handle_list_finetuning_checkpoints(request) return checkpoints +@register_microservice(name="opea_service@finetuning", endpoint="/v1/finetune/extract_sub_adapter", host="0.0.0.0", port=8015) +def extract_sub_adapter(request: ExtractSubAdapterParams): + return handle_extract_sub_adapter(request) + +@register_microservice(name="opea_service@finetuning", endpoint="/v1/finetune/merge_adapter", host="0.0.0.0", port=8015) +def merge_adapter(request: MergeAdapterParams): + return handle_merge_adapter(request) + if __name__ == "__main__": opea_microservices["opea_service@finetuning"].start() diff --git a/comps/finetuning/handlers.py b/comps/finetuning/handlers.py index a47b9f980a..dde6424420 100644 --- a/comps/finetuning/handlers.py +++ b/comps/finetuning/handlers.py @@ -11,7 +11,7 @@ from typing import Dict from fastapi import BackgroundTasks, File, Form, HTTPException, UploadFile -from pydantic_yaml import to_yaml_file +from pydantic_yaml import parse_yaml_file_as, to_yaml_file from ray.job_submission import JobSubmissionClient from comps import CustomLogger @@ -23,7 +23,12 @@ FineTuningJobList, UploadFileRequest, ) -from comps.finetuning.finetune_config import FinetuneConfig, FineTuningParams +from comps.finetuning.finetune_config import ( + ExtractSubAdapterParams, + FinetuneConfig, + FineTuningParams, + MergeAdapterParams, +) logger = CustomLogger("finetuning_handlers") @@ -134,6 +139,97 @@ def handle_create_finetuning_jobs(request: FineTuningParams, background_tasks: B return job +def handle_extract_sub_adapter(request: ExtractSubAdapterParams): + fine_tuning_job_id = request.fine_tuning_job_id + finetune_config_file = f"{JOBS_PATH}/{fine_tuning_job_id}.yaml" + finetune_config = parse_yaml_file_as(FinetuneConfig, finetune_config_file) + + job = running_finetuning_jobs.get(fine_tuning_job_id) + if job is None: + raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") + finetuned_model_path = os.path.join(OUTPUT_DIR, fine_tuning_job_id) + assert finetuned_model_path == finetune_config.General.output_dir + if not os.path.exists(finetuned_model_path): + raise HTTPException( + status_code=404, + detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!" + ) + if job.status != "succeeded": + raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' has not completed!") + + if finetune_config.General.lora_config is None: + raise HTTPException( + status_code=404, + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!" 
+ ) + if not finetune_config.General.lora_config.neural_lora_search: + raise HTTPException( + status_code=404, + detail=f"The fine-tuning job '{fine_tuning_job_id}' did not enable the NLS algorithm, " + "so there is no need to extract sub-adapters!" + ) + nncf_config_path = os.path.join(finetune_config.General.output_dir, "nncf_config.json") + if not os.path.exists(nncf_config_path): + raise HTTPException( + status_code=404, + detail=f"The NNCF config file does not exist in the fine-tuning job '{fine_tuning_job_id}'!" + ) + + from comps.finetuning.utils.extract_sub_adapter import main as extract_sub_adapter_main + extract_sub_adapter_main( + adapter_model_path=finetuned_model_path, + nncf_config=nncf_config_path, + adapter_version=request.adapter_version, + custom_config=request.custom_config + ) + + return fine_tuning_job_id + + +def handle_merge_adapter(request: MergeAdapterParams): + fine_tuning_job_id = request.fine_tuning_job_id + finetune_config_file = f"{JOBS_PATH}/{fine_tuning_job_id}.yaml" + finetune_config = parse_yaml_file_as(FinetuneConfig, finetune_config_file) + + job = running_finetuning_jobs.get(fine_tuning_job_id) + if job is None: + raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") + finetuned_model_path = os.path.join(OUTPUT_DIR, fine_tuning_job_id) + assert finetuned_model_path == finetune_config.General.output_dir + if not os.path.exists(finetuned_model_path): + raise HTTPException( + status_code=404, + detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!" + ) + if job.status != "succeeded": + raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' has not completed!") + + if finetune_config.General.lora_config is None: + raise HTTPException( + status_code=404, + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!" + ) + + adapter_path = finetuned_model_path + adapter_version = request.adapter_version + if adapter_version is not None: + adapter_path = os.path.join(adapter_path, adapter_version) + if not os.path.exists(adapter_path): + raise HTTPException( + status_code=404, + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not have a '{adapter_version}' adapter!"
+ ) + + from comps.finetuning.utils.merge_adapter import main as merge_adapter_main + merge_adapter_main( + base_model_path=finetune_config.General.base_model, + adapter_model_path=adapter_path, + output_path=os.path.join(adapter_path, "merged_model") + ) + + return fine_tuning_job_id + + def handle_list_finetuning_jobs(): finetuning_jobs_list = FineTuningJobList(data=list(running_finetuning_jobs.values()), has_more=False) diff --git a/comps/finetuning/llm_on_ray/finetune/finetune.py b/comps/finetuning/llm_on_ray/finetune/finetune.py index d105269a40..97a6257b33 100644 --- a/comps/finetuning/llm_on_ray/finetune/finetune.py +++ b/comps/finetuning/llm_on_ray/finetune/finetune.py @@ -39,6 +39,16 @@ logger = CustomLogger("llm_on_ray/finetune") +try: + from comps.finetuning.utils.create_sqft_nncf_config import create_sqft_nncf_config + from nncf.experimental.torch.nas.bootstrapNAS.training.model_creator_helpers import ( + create_compressed_model_from_algo_names, + ) + from nncf.torch.model_creation import create_nncf_network + is_nncf_available = True +except ImportError: + is_nncf_available = False + def adapt_transformers_to_device(config: Dict): device = config["Training"]["device"] @@ -338,6 +348,7 @@ def load_model(config: Dict): model_config = config["General"].get("config", {}) task = config["General"].get("task", "instruction_tuning") ref_model = None + nls_controller = None if task in ["instruction_tuning", "pretraining", "dpo"]: model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, **model_config) if task == "dpo": @@ -346,8 +357,26 @@ def load_model(config: Dict): ) lora_config = config["General"].get("lora_config", None) if lora_config and task == "instruction_tuning": + neural_lora_search = lora_config.pop("neural_lora_search", False) + target_module_groups = lora_config.pop("target_module_groups", None) + search_space = lora_config.pop("search_space", None) peft_config = LoraConfig(**lora_config) model = get_peft_model(model, peft_config) + + # Neural LoRA Search (NLS) + if neural_lora_search: + if not is_nncf_available: + raise NotImplementedError("NNCF is not installed. 
Please install it for enabling NLS algorithm.") + nncf_config = create_sqft_nncf_config( + config=config, + model=model, + target_module_groups=target_module_groups, + search_space=search_space + ) + model = create_nncf_network(model, nncf_config) + nls_controller, model = create_compressed_model_from_algo_names( + model, nncf_config, algo_names=["nls"] + ) elif task == "rerank": model = CrossEncoder.from_pretrained( config["Dataset"].get("train_group_size", 8), @@ -383,10 +412,9 @@ def load_model(config: Dict): model.to(dtype=model_dtype, device=torch.device(config["Training"]["device"])) - return model, ref_model - + return model, ref_model, nls_controller -def get_trainer(config: Dict, model, ref_model, tokenizer, tokenized_dataset, data_collator): +def get_trainer(config: Dict, model, ref_model, tokenizer, tokenized_dataset, data_collator, nls_controller=None): device = config["Training"]["device"] task = config["General"].get("task", "instruction_tuning") if device in ["cpu", "gpu", "cuda"]: @@ -411,18 +439,21 @@ def get_trainer(config: Dict, model, ref_model, tokenizer, tokenized_dataset, da max_length=config["Dataset"].get("max_length", 1024), ) else: - trainer = Trainer( - model=model, - args=training_args, - train_dataset=tokenized_dataset["train"], - eval_dataset=( - tokenized_dataset["validation"] if tokenized_dataset.get("validation") is not None else None - ), - tokenizer=tokenizer, - data_collator=data_collator, - ) + trainer_args = { + "model": model, + "args": training_args, + "train_dataset": tokenized_dataset["train"], + "eval_dataset": tokenized_dataset["validation"] if tokenized_dataset.get("validation") is not None else None, + "tokenizer": tokenizer, + "data_collator": data_collator, + } + if nls_controller is not None: + trainer_args["compression_ctrl"] = nls_controller + trainer = Trainer(**trainer_args) return training_args, trainer elif device in ["hpu"]: + if nls_controller is not None: + raise NotImplementedError(f"NLS algorithm is not supported on HPU now.") from optimum.habana import GaudiConfig from optimum.habana.transformers import GaudiTrainer, GaudiTrainingArguments @@ -495,9 +526,11 @@ def train_func(config: Dict[str, Any]): data_collator = prepare_data_collator(config, tokenizer) - model, ref_model = load_model(config) + model, ref_model, nls_controller = load_model(config) - training_args, trainer = get_trainer(config, model, ref_model, tokenizer, tokenized_dataset, data_collator) + training_args, trainer = get_trainer( + config, model, ref_model, tokenizer, tokenized_dataset, data_collator, nls_controller=nls_controller + ) logger.info("train start") trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) diff --git a/comps/finetuning/utils/create_sqft_nncf_config.py b/comps/finetuning/utils/create_sqft_nncf_config.py new file mode 100644 index 0000000000..eb76fcc310 --- /dev/null +++ b/comps/finetuning/utils/create_sqft_nncf_config.py @@ -0,0 +1,142 @@ +import os +import json + +try: + from nncf import NNCFConfig + from nncf.experimental.torch import sqft + is_nncf_available = True +except ImportError: + is_nncf_available = False + + +NNCF_CONFIG_TEMPLATE = { + "input_info": [ + { + "sample_size": [1, 256], + "type": "long", + "keyword": "input_ids" + }, + { + "sample_size": [1, 256], + "type": "long", + "keyword": "attention_mask" + } + ], + "SQFT": { + "training": { + "algorithm": "nls", + "elasticity": { + "available_elasticity_dims": ["width"], + "width": { + "overwrite_groups": [], + "overwrite_groups_widths": [] + } + } + } + 
} +} + + +def add_lr_epochs(nncf_config, learning_rate=3e-4, num_train_epochs=3): + """Add learning rate and epochs to the NNCF configuration. + + Args: + nncf_config (dict): The NNCF configuration dictionary. + learning_rate (float): The initial learning rate to set. + num_train_epochs (int): The number of epochs to set. + + Returns: + dict: The updated NNCF configuration. + """ + overwrite_groups_widths = nncf_config["SQFT"]["training"]["elasticity"]["width"]["overwrite_groups_widths"] + # Add learning rate and epochs to the configuration + nncf_config["SQFT"]["training"]["schedule"] = { + "list_stage_descriptions": [ + { + "train_dims": ["width"], + "width_indicator": max([len(widths) for widths in overwrite_groups_widths]), + "init_lr": learning_rate, + "epochs": num_train_epochs, + "epochs_lr": num_train_epochs, + } + ] + } + return nncf_config + + +def get_model_paths(model, target_module_name): + """ + Find all paths to the target layer in the model. + + Args: + model (torch.nn.Module): The model to search. + target_module_name (str): The name of the target layer. + + Returns: + list: A list of paths to the target layer. + """ + def find_layers(module, target_module_name, path, paths): + for name, sub_module in module.named_children(): + new_path = f"{path}/{sub_module.__class__.__name__}[{name}]" + if target_module_name in name: + # Check if 'lora_A' is in the sub_module's children + for sub_name, _ in sub_module.named_children(): + if "lora_A" in sub_name: + paths.append(f"{new_path}/ModuleDict[lora_A]/NNCFLinear[default]/linear_0") + find_layers(sub_module, target_module_name, new_path, paths) + + base_path = model.__class__.__name__ + paths = [] + find_layers(model, target_module_name, base_path, paths) + return paths + + +def create_sqft_nncf_config( + config, + model, + target_module_groups=None, + search_space=None +): + """Create and preprocess the NNCF configuration for Neural LoRA Search. + + Returns: + NNCFConfig: The preprocessed NNCF configuration object. + """ + if not is_nncf_available: + raise NotImplementedError("NNCF is not installed. Please install it to enable the NLS algorithm.") + if target_module_groups is None or search_space is None: + raise ValueError("Neural LoRA Search is enabled, so both `target_module_groups` and `search_space` must be provided.") + # The NNCF Config will be automatically generated based on `target_module_groups` and `search_space`.
+ num_hidden_layers = model.config.num_hidden_layers + nncf_config_dict = NNCF_CONFIG_TEMPLATE + overwrite_groups = [] + for group in target_module_groups: + group_paths = [] + for module in group: + target_layer_name = module + paths = get_model_paths(model, target_layer_name) + assert paths, f"No paths found for module {module}" + group_paths.append(paths) + # Transpose the list of lists to combine paths by their positions + transposed_paths = list(zip(*group_paths)) + overwrite_groups.extend([list(path_group) for path_group in transposed_paths]) + nncf_config_dict["SQFT"]["training"]["elasticity"]["width"]["overwrite_groups"] = overwrite_groups + + overwrite_groups_widths = [] + for space in search_space: + space = [int(width) for width in space.split(",")] + overwrite_groups_widths.extend([space] * num_hidden_layers) + nncf_config_dict["SQFT"]["training"]["elasticity"]["width"]["overwrite_groups_widths"] = overwrite_groups_widths + assert len(overwrite_groups) == len(overwrite_groups_widths) + nncf_config_dict = add_lr_epochs( + nncf_config_dict, + learning_rate=config["Training"]["learning_rate"], + num_train_epochs=config["Training"]["epochs"] + ) + nncf_config = NNCFConfig.from_dict(nncf_config_dict) + + nncf_config["log_dir"] = config["General"]["output_dir"] + os.makedirs(nncf_config["log_dir"], exist_ok=True) + with open(os.path.join(nncf_config["log_dir"], "nncf_config.json"), "w") as f: + json.dump(nncf_config, f, indent=4) + return nncf_config diff --git a/comps/finetuning/utils/extract_sub_adapter.py b/comps/finetuning/utils/extract_sub_adapter.py new file mode 100644 index 0000000000..f7b0bf6ff1 --- /dev/null +++ b/comps/finetuning/utils/extract_sub_adapter.py @@ -0,0 +1,97 @@ +import os +import re + +import torch + +from peft.utils import CONFIG_NAME, WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME + +try: + from nncf import NNCFConfig + is_nncf_available = True +except ImportError: + is_nncf_available = False + + +PATTERN = re.compile(r"[[](.*?)[]]", re.S) + + +def get_width_for_query_prefix(torch_module_to_width, query_module, length=5): + """ + Get the width for a given query module prefix. + + Args: + torch_module_to_width (dict): Mapping from torch module to width. + query_module (str): The query module name. + length (int, optional): The length of the prefix to match. Default is 5. + + Returns: + int: The width for the query module prefix. + """ + query_module_list = query_module.split(".") + width = next( + ( + value + for torch_module, value in torch_module_to_width.items() + if torch_module.split(".")[:length] == query_module_list[:length] + ), + None, + ) + return width + + +def main(adapter_model_path, nncf_config, adapter_version, custom_config=None): + if not is_nncf_available: + raise NotImplementedError("NNCF is not installed. 
Please install it.") + output_dir = os.path.join(adapter_model_path, adapter_version) + os.makedirs(output_dir, exist_ok=True) + nncf_config = NNCFConfig.from_json(nncf_config) + try: + groups = nncf_config["SQFT"]["training"]["elasticity"]["width"]["overwrite_groups"] + groups_widths = nncf_config["SQFT"]["training"]["elasticity"]["width"]["overwrite_groups_widths"] + assert len(groups) == len(groups_widths) + except Exception: + raise ValueError("Cannot get the search space in NNCF config.") + + if adapter_version == "maximal": + subnetwork_config = {idx: space[0] for idx, space in enumerate(groups_widths)} + elif adapter_version == "heuristic": + subnetwork_config = {idx: space[(len(space) - 1) // 2] for idx, space in enumerate(groups_widths)} + elif adapter_version == "minimal": + subnetwork_config = {idx: space[-1] for idx, space in enumerate(groups_widths)} + else: + assert custom_config is not None, "Missing custom subnetwork config." + assert isinstance(custom_config, list), "Custom config must be a list." + subnetwork_config = {i: value for i, value in enumerate(custom_config)} + + # Mapping: nncf node -> width + nncf_node_to_width = {} + for idx, value in subnetwork_config.items(): + space = groups_widths[idx] + assert min(space) <= value <= max(space) + cur_dict = {node: value for node in groups[idx]} + nncf_node_to_width.update(cur_dict) + + # Prune adapter model (LoRA low-rank) + lora_torch_module_to_width = { + ".".join(re.findall(PATTERN, k)): v for k, v in nncf_node_to_width.items() if "lora_A" in k + } + num_module_name_item = list(lora_torch_module_to_width.keys())[0].split(".").index("lora_A") + # Load adapter weights + try: + super_adapter_weights = torch.load(os.path.join(adapter_model_path, WEIGHTS_NAME)) + except: + from safetensors.torch import load_file + super_adapter_weights = load_file(os.path.join(adapter_model_path, SAFETENSORS_WEIGHTS_NAME)) + sub_adapter_weights = {} + for weight_key, weight_tensor in super_adapter_weights.items(): + width = get_width_for_query_prefix(lora_torch_module_to_width, weight_key, length=num_module_name_item) + if width is not None: + is_loraA = "lora_A" in weight_key + new_weight_tensor = weight_tensor[:width].clone() if is_loraA else weight_tensor[:, :width].clone() + else: + new_weight_tensor = weight_tensor.clone() + sub_adapter_weights[weight_key] = new_weight_tensor + os.makedirs(output_dir, exist_ok=True) + torch.save(sub_adapter_weights, os.path.join(output_dir, WEIGHTS_NAME)) + config_path = os.path.join(adapter_model_path, CONFIG_NAME) + os.system(f"cp {config_path} {output_dir}") diff --git a/comps/finetuning/utils/merge_adapter.py b/comps/finetuning/utils/merge_adapter.py new file mode 100644 index 0000000000..a127061ef6 --- /dev/null +++ b/comps/finetuning/utils/merge_adapter.py @@ -0,0 +1,16 @@ +from peft import PeftModel +from transformers import AutoModelForCausalLM, AutoTokenizer + + +def main(base_model_path, adapter_model_path, output_path): + base_model = AutoModelForCausalLM.from_pretrained(base_model_path, trust_remote_code=True) + model = PeftModel.from_pretrained(base_model, adapter_model_path) + model.eval() + for name, param in model.named_parameters(): + param.requires_grad = False + merged_model = model.merge_and_unload() + merged_model.train(False) + base_model.save_pretrained(output_path, state_dict=merged_model.state_dict()) + + tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True) + tokenizer.save_pretrained(output_path) diff --git a/comps/finetuning_sqft/Dockerfile 
b/comps/finetuning_sqft/Dockerfile new file mode 100644 index 0000000000..4715470aec --- /dev/null +++ b/comps/finetuning_sqft/Dockerfile @@ -0,0 +1,50 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Use the same Python version as Ray +FROM python:3.10.14 + +ARG HF_TOKEN + +ENV HF_TOKEN=$HF_TOKEN + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +COPY comps /home/user/comps + +RUN chown -R user /home/user/comps/finetuning_sqft + +USER user + +ENV PATH=$PATH:/home/user/.local/bin + +RUN python -m pip install --no-cache-dir --upgrade pip && \ + python -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \ + python -m pip install --no-cache-dir intel-extension-for-pytorch && \ + python -m pip install --no-cache-dir oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \ + python -m pip install --no-cache-dir -r /home/user/comps/finetuning_sqft/requirements.txt + +# Set up third-party dependencies (SQFT) +ENV PATH_TO_FINETUNE=/home/user/comps/finetuning_sqft +RUN mkdir -p $PATH_TO_FINETUNE/third_party && cd $PATH_TO_FINETUNE/third_party && \ + git clone https://github.com/huggingface/peft.git && \ + cd peft && git checkout v0.10.0 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/peft-v0.10.0.patch && pip install -e . && cd .. && \ + git clone https://github.com/huggingface/transformers.git && \ + cd transformers && git checkout v4.44.2 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/transformers-v4.44.2.patch && pip install -e . && cd .. && \ + git clone https://github.com/openvinotoolkit/nncf.git && \ + cd nncf && git checkout v2.12.0 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/nncf-v2.12.0.patch && pip install -e . && cd .. + +ENV PYTHONPATH=$PYTHONPATH:/home/user + +WORKDIR /home/user/comps/finetuning_sqft + +RUN echo PKGPATH=$(python3 -c "import pkg_resources; print(pkg_resources.get_distribution('oneccl-bind-pt').location)") >> run.sh && \ + echo 'export LD_LIBRARY_PATH=$PKGPATH/oneccl_bindings_for_pytorch/opt/mpi/lib/:$LD_LIBRARY_PATH' >> run.sh && \ + echo 'source $PKGPATH/oneccl_bindings_for_pytorch/env/setvars.sh' >> run.sh && \ + echo ray start --head --dashboard-host=0.0.0.0 >> run.sh && \ + echo export RAY_ADDRESS=http://localhost:8265 >> run.sh && \ + echo python finetuning_sqft_service.py >> run.sh + +CMD bash run.sh diff --git a/comps/finetuning_sqft/README.md new file mode 100644 index 0000000000..a5748caf76 --- /dev/null +++ b/comps/finetuning_sqft/README.md @@ -0,0 +1,240 @@ +# SQFT Fine-tuning Microservice + +Fine-tuning microservice with SQFT involves adapting a model to a specific task or dataset to improve its performance on that task. We currently support instruction tuning for LLMs. + +## 🚀1. 
Start Microservice with Python (Option 1) + +### 1.1 Install Requirements + +```bash +python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +python -m pip install intel-extension-for-pytorch +python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ +pip install -r requirements.txt +``` +To enable elastic adapter fine-tuning (Neural Low-Rank Adapter Search) or SparsePEFT from [SQFT](https://arxiv.org/abs/2410.03750), please perform this additional installation: + +```bash +PATH_TO_FINETUNE=$PWD +mkdir third_party && cd third_party + +# transformers (for Neural Lora Search) +git clone https://github.com/huggingface/transformers.git +cd transformers && git checkout v4.44.2 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/transformers-v4.44.2.patch && pip install -e . && cd .. + +# peft (for Neural Low-Rank Adapter Search and SparsePEFT) +git clone https://github.com/huggingface/peft.git +cd peft && git checkout v0.10.0 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/peft-v0.10.0.patch && pip install -e . && cd .. + +# nncf (for Neural Lora Search) +git clone https://github.com/openvinotoolkit/nncf.git +cd nncf && git checkout v2.12.0 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/nncf-v2.12.0.patch && pip install -e . && cd .. +``` + +### 1.2 Start Fine-tuning Service with Python Script + +#### 1.2.1 Start Ray Cluster + +OneCCL and Intel MPI libraries should be dynamically linked in every node before Ray starts: + +```bash +source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl; print(torch_ccl.cwd)")/env/setvars.sh +``` + +Start Ray locally using the following command. + +```bash +ray start --head +``` + +For a multi-node cluster, start additional Ray worker nodes with below command. + +```bash +ray start --address='${head_node_ip}:6379' +``` + +#### 1.2.2 Start Finetuning Service + +```bash +export HF_TOKEN= +export PYTHONPATH= +python finetuning_sqft_service.py +``` + +## 🚀2. Start Microservice with Docker (Option 2) + +### 2.1 Setup on CPU + +#### 2.1.1 Build Docker Image + +Build docker image with below command: + +```bash +export HF_TOKEN=${your_huggingface_token} +cd ../../ +docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg HF_TOKEN=$HF_TOKEN -f comps/finetuning_sqft/Dockerfile . +``` + +#### 2.1.2 Run Docker with CLI + +Start docker container with below command: + +```bash +docker run -d --name="finetuning-server" -p 8015:8015 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest +``` + +## 🚀3. 
Consume Fine-tuning Service + +### 3.1 Upload a training file + +Download a training file such as `alpaca_data.json` for instruction tuning (it can be downloaded from [here](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json)) and upload it to the server with the following command: + +```bash +# upload a training file +curl http://${your_ip}:8015/v1/files -X POST -H "Content-Type: multipart/form-data" -F "file=@./alpaca_data.json" -F purpose="fine-tune" +``` + + +### 3.2 Create fine-tuning job + +#### 3.2.1 Instruction Tuning + +After a training file like `alpaca_data.json` is uploaded, use the following command to launch a fine-tuning job using `meta-llama/Llama-2-7b-chat-hf` as the base model: + +```bash +# create a finetuning job +curl http://${your_ip}:8015/v1/fine_tuning/jobs \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "training_file": "alpaca_data.json", + "model": "meta-llama/Llama-2-7b-chat-hf" + }' + +# create a finetuning job (with SparsePEFT) +curl http://${your_ip}:8015/v1/fine_tuning/jobs \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "training_file": "alpaca_data.json", + "model": "<base model name or path>", + "General": { + "lora_config": { + "sparse_adapter": true + } + } + }' + +# create a fine-tuning job (with Neural Low-Rank Adapter Search) +# Max LoRA rank: 16 +# LoRA target modules -> Low-rank search space +# ["q_proj", "k_proj", "v_proj"] -> [16,12,8] +# ["up_proj"] -> [16,12,8] +# ["down_proj"] -> [16,12,8] +curl http://${your_ip}:8015/v1/fine_tuning/jobs \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "training_file": "alpaca_data.json", + "model": "meta-llama/Llama-2-7b-chat-hf", + "General": { + "lora_config": { + "r": 16, + "neural_lora_search": true, + "target_module_groups": [["q_proj", "k_proj", "v_proj"], ["up_proj"], ["down_proj"]], + "search_space": ["16,12,8", "16,12,8", "16,12,8"] + } + } + }' +``` + +Below are some explanations for the parameters related to the Neural Low-Rank Adapter Search algorithm: + +- `target_module_groups` specifies the target module groups, which means that the adapters within the same group will share the same activated low-rank value. +- `search_space` specifies the search space for each target module (adapter) group. +Here, it is `["16,12,8", "16,12,8", "16,12,8"]`, meaning that the search space for each group is [16, 12, 8]. + +Note that the number of groups should be equal to the number of search spaces (one-to-one correspondence). +Feel free to try your favorite group design and search spaces. + +### 3.3 Manage fine-tuning job + +The commands below show how to list fine-tuning jobs, retrieve a fine-tuning job, cancel a fine-tuning job, and list the checkpoints of a fine-tuning job. 
+ +```bash +# list fine-tuning jobs +curl http://${your_ip}:8015/v1/fine_tuning/jobs -X GET + +# retrieve one fine-tuning job +curl http://localhost:8015/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}' + +# cancel one fine-tuning job +curl http://localhost:8015/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}' + +# list checkpoints of a fine-tuning job +curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}' +``` + +### 3.4 Leverage fine-tuned model + +#### 3.4.1 Extract the sub-adapter + +After completing the super-adapter fine-tuning (the super-adapter is the checkpoint saved by the fine-tuning job), +the following command demonstrates how to extract the heuristic sub-adapter. +Additionally, more powerful sub-adapters can be obtained through other advanced search algorithms. + +```bash +curl http://${your_ip}:8015/v1/finetune/extract_adapter \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "fine_tuning_job_id": ${fine_tuning_job_id}, + "sub_adapter_version": "heuristic" + }' +``` + +`sub_adapter_version` can be `heuristic`, `minimal`, or a custom name. +When `sub_adapter_version` is set to a custom name, you need to provide a specific configuration in `custom_config`. +The extracted adapter will be saved in `<output directory>/<sub_adapter_version>`. + +
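+To make the named versions concrete: each adapter group has an ordered search space of ranks (listed from largest to smallest, e.g. `[16, 12, 8]`), and a named version simply picks one rank per group. The sketch below mirrors the selection logic of the `extract_sub_adapter` utility added in this patch and is only an illustration, not part of the service API:
+
+```python
+# Rank selection per adapter group, mirroring utils/extract_sub_adapter.py:
+# "maximal" takes the largest rank, "heuristic" the middle one, "minimal" the smallest.
+groups_widths = [[16, 12, 8], [16, 12, 8], [16, 12, 8]]  # example: one ordered rank list per group
+
+def select_ranks(groups_widths, version):
+    if version == "maximal":
+        return [space[0] for space in groups_widths]
+    if version == "heuristic":
+        return [space[(len(space) - 1) // 2] for space in groups_widths]
+    if version == "minimal":
+        return [space[-1] for space in groups_widths]
+    raise ValueError("a custom version requires an explicit custom_config list")
+
+print(select_ranks(groups_widths, "heuristic"))  # [12, 12, 12]
+```
+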
+An example of a custom configuration: + +```bash +curl http://${your_ip}:8015/v1/finetune/extract_adapter \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "fine_tuning_job_id": ${fine_tuning_job_id}, + "sub_adapter_version": "custom", + "custom_config": [8, 8, 16, 8, 8, 12, 8, 12, 12, 12, 8, 16, 12, 16, 16, 12, 12, 8, 8, 16, 8, 8, 12, 8, 16, 12, 8, 16, 8, 16, 12, 8, 8, 16, 16, 16, 16, 16, 8, 12, 12, 16, 12, 16, 12, 16, 16, 12, 8, 12, 12, 8, 8, 12, 8, 12, 12, 8, 16, 8, 8, 8, 8, 12, 16, 16] + }' +``` + +In a fine-tuning job with the Neural Low-Rank Adapter Search algorithm, the `nncf_config.json` file (which includes the elastic adapter information) will be saved in the output directory. +The `custom_config` must correspond to the `overwrite_groups` (adapter modules) or `overwrite_groups_widths` +(search space for the rank of adapter modules) in `nncf_config.json`. +The above command corresponds to the example in [example_nncf_config/nncf_config.json](./example_nncf_config/nncf_config.json), +and it will save the sub-adapter to `<output directory>/custom`. + +
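+Before submitting a custom configuration, it can help to sanity-check it against the saved `nncf_config.json`. The snippet below is a minimal sketch (not part of the service); it assumes the layout of the example config in this directory, where the group search spaces are stored under the elasticity `width` section, and checks that `custom_config` has one rank per adapter group and that each rank lies inside that group's search space:
+
+```python
+import json
+
+# Assumption: path to the NNCF config saved in the fine-tuning job's output directory.
+with open("example_nncf_config/nncf_config.json") as f:
+    nncf_config = json.load(f)
+
+# The example config in this patch keeps the NAS settings under "bootstrapNAS";
+# configs generated by the SQFT NNCF utilities may use an "SQFT" key instead.
+training = (nncf_config.get("bootstrapNAS") or nncf_config["SQFT"])["training"]
+groups_widths = training["elasticity"]["width"]["overwrite_groups_widths"]
+
+# Placeholder custom config: the smallest rank for every group; replace with your own choices.
+custom_config = [space[-1] for space in groups_widths]
+
+assert len(custom_config) == len(groups_widths), "one rank per adapter group is required"
+for rank, space in zip(custom_config, groups_widths):
+    assert rank in space, f"rank {rank} is not in the group's search space {space}"
+print(f"custom_config with {len(custom_config)} entries is consistent with nncf_config.json")
+```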
+ +#### 3.4.2 Merge + +The following command demonstrates how to merge the sub-adapter into the base pretrained model: + +```bash +curl http://${your_ip}:8015/v1/finetune/merge_adapter \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "fine_tuning_job_id": ${fine_tuning_job_id}, + "sub_adapter_version": "heuristic" + }' +``` + +The merged model will be saved in `<output directory>/<sub_adapter_version>/merged_model`. + +## 🚀4. Descriptions for Finetuning parameters + +We utilize [OpenAI finetuning parameters](https://platform.openai.com/docs/api-reference/fine-tuning) and extend them with more customizable parameters; see the definitions in [finetune_sqft_config](./finetune_sqft_config.py). diff --git a/comps/finetuning_sqft/example_nncf_config/nncf_config.json new file mode 100644 index 0000000000..ead7ffe4c6 --- /dev/null +++ b/comps/finetuning_sqft/example_nncf_config/nncf_config.json @@ -0,0 +1,630 @@ +{ + "input_info": [ + { + "sample_size": [ + 1, + 256 + ], + "type": "long", + "keyword": "input_ids" + }, + { + "sample_size": [ + 1, + 256 + ], + "type": "long", + "keyword": "attention_mask" + } + ], + "bootstrapNAS": { + "training": { + "algorithm": "progressive_shrinking", + "frozen_layers_allowed": true, + "progressivity_of_elasticity": [ + "width" + ], + "batchnorm_adaptation": { + "num_bn_adaptation_samples": 0 + }, + "schedule": { + "list_stage_descriptions": [ + { + "train_dims": [ + "width" + ], + "epochs": 3, + "depth_indicator": 1, + "width_indicator": 8, + "init_lr": 0.0003, + "epochs_lr": 3, + "sample_rate": 1 + } + ] + }, + "elasticity": { + "available_elasticity_dims": [ + "width" + ], + "width": { + "overwrite_groups": [ + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ] + ], + "overwrite_groups_widths": [ + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, 
+ 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ], + [ + 16, + 12, + 8 + ] + ] + } + } + } + } +} \ No newline at end of file diff --git a/comps/finetuning_sqft/finetune_runner.py b/comps/finetuning_sqft/finetune_runner.py new file mode 100644 index 0000000000..45cad43d56 --- /dev/null +++ b/comps/finetuning_sqft/finetune_runner.py @@ -0,0 +1,38 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import argparse + +from pydantic_yaml import parse_yaml_raw_as +from transformers import TrainerCallback, TrainerControl, TrainerState, TrainingArguments + +from comps.finetuning_sqft.finetune_sqft_config import FinetuneConfig + + +class FineTuneCallback(TrainerCallback): + def __init__(self) -> None: + super().__init__() + + def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + print("FineTuneCallback:", args, state) + + +def main(): + parser = argparse.ArgumentParser(description="Runner for llm_on_ray-finetune") + parser.add_argument("--config_file", type=str, required=True, default=None) + args = parser.parse_args() + model_config_file = args.config_file + + with open(model_config_file) as f: + finetune_config = parse_yaml_raw_as(FinetuneConfig, f).model_dump() + + callback = FineTuneCallback() + finetune_config["Training"]["callbacks"] = [callback] + + from comps.finetuning_sqft.llm_on_ray.finetune.finetune import main as llm_on_ray_finetune_main + + llm_on_ray_finetune_main(finetune_config) + + +if __name__ == "__main__": + main() diff --git a/comps/finetuning_sqft/finetune_sqft_config.py b/comps/finetuning_sqft/finetune_sqft_config.py new file mode 100644 index 0000000000..a34a9e7c3b --- /dev/null +++ b/comps/finetuning_sqft/finetune_sqft_config.py @@ -0,0 +1,215 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Copyright 2023 The LLM-on-Ray Authors. 
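The runner above consumes a YAML file that matches the `FinetuneConfig` schema defined later in this file. As a hedged illustration only, the sketch below builds such a config with SQFT's NLS options enabled and serializes it with `pydantic_yaml.to_yaml_file` (the same helper the handlers use); the model name, dataset path, and output paths are placeholders, not values mandated by this patch.

```python
# Hypothetical sketch: build a FinetuneConfig with SQFT's NLS enabled and
# write it to the YAML file that finetune_runner.py expects.
# Field names follow the schema in this file; model/data values are placeholders.
from pydantic_yaml import to_yaml_file

from comps.finetuning_sqft.finetune_sqft_config import (
    DatasetConfig,
    FinetuneConfig,
    GeneralConfig,
    SQFTLoRAConfig,
)

config = FinetuneConfig(
    General=GeneralConfig(
        base_model="meta-llama/Llama-2-7b-chat-hf",  # placeholder model
        output_dir="./tmp/sqft-nls",                 # placeholder output dir
        lora_config=SQFTLoRAConfig(
            r=16,
            neural_lora_search=True,
            target_module_groups=[["q_proj", "k_proj", "v_proj"], ["up_proj"], ["down_proj"]],
            search_space=["16,12,8", "16,12,8", "16,12,8"],
        ),
    ),
    Dataset=DatasetConfig(train_file="datasets/alpaca_data.json"),  # placeholder path
)

# Serialize to YAML; the file could then be passed to the runner, e.g.:
#   python finetune_runner.py --config_file jobs/example-nls-job.yaml
to_yaml_file("jobs/example-nls-job.yaml", config)
```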
+ +from typing import List, Optional, Union + +from pydantic import BaseModel, Field, validator, root_validator + +from comps.cores.proto.api_protocol import FineTuningJobIDRequest, FineTuningJobsRequest + +PRECISION_BF16 = "bf16" +PRECISION_FP16 = "fp16" +PRECISION_NO = "no" + +DEVICE_CPU = "cpu" +DEVICE_HPU = "hpu" +DEVICE_GPU = "gpu" +DEVICE_CUDA = "cuda" + +ACCELERATE_STRATEGY_DDP = "DDP" +ACCELERATE_STRATEGY_FSDP = "FSDP" +ACCELERATE_STRATEGY_DEEPSPEED = "DEEPSPEED" + + +class LoadConfig(BaseModel): + trust_remote_code: bool = False + # set Huggingface token to access dataset/model + token: Optional[str] = None + + +class LoraConfig(BaseModel): + task_type: str = "CAUSAL_LM" + r: int = 8 + lora_alpha: int = 16 + lora_dropout: float = 0.1 + target_modules: Optional[List[str]] = None + + +class SQFTLoRAConfig(LoraConfig): + neural_lora_search: bool = False + target_module_groups: Optional[List[List[str]]] = None + search_space: Optional[List[str]] = None + sparse_adapter: bool = False + nncf_config: Optional[str] = None + + @root_validator(pre=True) + def set_target_modules(cls, values): + target_module_groups = values.get('target_module_groups') + if target_module_groups is not None: + values['target_modules'] = [item for sublist in target_module_groups for item in sublist] + search_space = values.get('search_space') + if search_space is not None: + assert len(search_space) == len(target_module_groups) + return values + + +class GeneralConfig(BaseModel): + base_model: str = None + tokenizer_name: Optional[str] = None + gaudi_config_name: Optional[str] = None + gpt_base_model: bool = False + output_dir: str = "./tmp" + report_to: str = "none" + resume_from_checkpoint: Optional[str] = None + save_strategy: str = "no" + config: LoadConfig = LoadConfig() + lora_config: Optional[Union[LoraConfig, SQFTLoRAConfig]] = LoraConfig() + enable_gradient_checkpointing: bool = False + task: str = "instruction_tuning" + + @validator("report_to") + def check_report_to(cls, v: str): + assert v in ["none", "tensorboard"] + return v + + @validator("task") + def check_task(cls, v: str): + assert v in ["instruction_tuning"] + return v + + +class DatasetConfig(BaseModel): + train_file: str = None + validation_file: Optional[str] = None + validation_split_percentage: int = 5 + max_length: int = 512 + group: bool = True + block_size: int = 512 + shuffle: bool = False + max_source_length: int = 384 + padding_side: str = "right" + truncation_side: str = "right" + max_seq_length: int = 512 + truncation: bool = True + padding: Union[bool, str] = True + mask_input: bool = True + mask_response: bool = True + data_preprocess_type: str = "neural_chat" + max_train_samples: int = 0 + max_eval_samples: int = 0 + train_group_size: int = 8 + query_max_len: int = Field( + default=128, + description=( + "The maximum total input sequence length after tokenization for passage. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + ), + ) + passage_max_len: int = Field( + default=128, + description=( + "The maximum total input sequence length after tokenization for passage. Sequences longer " + "than this will be truncated, sequences shorter will be padded." 
+ ), + ) + query_instruction_for_retrieval: Optional[str] = Field(default=None, description="instruction for query") + passage_instruction_for_retrieval: Optional[str] = Field(default=None, description="instruction for passage") + + +class RayResourceConfig(BaseModel): + CPU: int = 32 + GPU: int = 0 + HPU: int = 0 + + +class TrainingConfig(BaseModel): + optimizer: str = "adamw_torch" + batch_size: int = 2 + epochs: int = 1 + max_train_steps: Optional[int] = None + learning_rate: float = 5.0e-5 + lr_scheduler: str = "linear" + weight_decay: float = 0.0 + device: str = DEVICE_CPU + hpu_execution_mode: str = "lazy" + num_training_workers: int = 1 + resources_per_worker: RayResourceConfig = RayResourceConfig() + accelerate_mode: str = ACCELERATE_STRATEGY_DDP + mixed_precision: str = PRECISION_NO + gradient_accumulation_steps: int = 1 + logging_steps: int = 10 + deepspeed_config_file: str = "" + + @validator("device") + def check_device(cls, v: str): + # will convert to lower case + if v: + assert v.lower() in [DEVICE_CPU, DEVICE_GPU, DEVICE_HPU, DEVICE_CUDA] + return v.lower() + + @validator("hpu_execution_mode") + def check_hpu_execution_mode(cls, v: str): + if v: + assert v in ["lazy", "eager", "eager.compile"] + return v + + @validator("accelerate_mode") + def check_accelerate_mode(cls, v: str): + if v: + assert v in [ + ACCELERATE_STRATEGY_DDP, + ACCELERATE_STRATEGY_FSDP, + ACCELERATE_STRATEGY_DEEPSPEED, + ] + return v + + @validator("mixed_precision") + def check_mixed_precision(cls, v: str): + if v: + assert v in [PRECISION_BF16, PRECISION_FP16, PRECISION_NO] + return v + + @validator("logging_steps") + def check_logging_steps(cls, v: int): + assert v > 0 + return v + + # @model_validator(mode='after') + # def check_device_and_accelerate_mode(self) -> "Training": + # dev = self.device + # res = self.resources_per_worker + # mode = self.accelerate_mode + # if dev == "CPU": + # if res.GPU is not None and res.GPU > 0: + # raise ValueError("Please not specified GPU resource when use CPU only in Ray.") + # if mode != "CPU_DDP": + # raise ValueError("Please specified CPU related accelerate mode when use CPU only in Ray.") + # elif dev == "GPU": + # if res.GPU is None or res.GPU == 0: + # raise ValueError("Please specified GPU resource when use GPU to fine tune in Ray.") + # if mode not in ["GPU_DDP", "GPU_FSDP"]: + # raise ValueError("Please speicifed GPU related accelerate mode when use GPU to fine tune in Ray.") + + # return self + + +class FinetuneConfig(BaseModel): + General: GeneralConfig = GeneralConfig() + Dataset: DatasetConfig = DatasetConfig() + Training: TrainingConfig = TrainingConfig() + + +class FineTuningParams(FineTuningJobsRequest): + # priority use FineTuningJobsRequest params + General: GeneralConfig = GeneralConfig() + Dataset: DatasetConfig = DatasetConfig() + Training: TrainingConfig = TrainingConfig() + +class ExtractAdapterParams(FineTuningJobIDRequest): + sub_adapter_version: str = "heuristic" + custom_config: Optional[List[int]] = None + +class MergeAdapterParams(FineTuningJobIDRequest): + adapter_version: Optional[str] = None diff --git a/comps/finetuning_sqft/finetuning_sqft_service.py b/comps/finetuning_sqft/finetuning_sqft_service.py new file mode 100644 index 0000000000..bc11a6cd23 --- /dev/null +++ b/comps/finetuning_sqft/finetuning_sqft_service.py @@ -0,0 +1,76 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +from fastapi import BackgroundTasks, Depends + +from comps import opea_microservices, register_microservice +from 
comps.cores.proto.api_protocol import FineTuningJobIDRequest, UploadFileRequest +from comps.finetuning_sqft.finetune_sqft_config import FineTuningParams, ExtractAdapterParams, MergeAdapterParams +from comps.finetuning_sqft.handlers import ( + handle_cancel_finetuning_job, + handle_create_finetuning_jobs, + handle_extract_sub_adapter, + handle_list_finetuning_checkpoints, + handle_list_finetuning_jobs, + handle_merge_adapter, + handle_retrieve_finetuning_job, + handle_upload_training_files, + upload_file, +) + + +@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015) +def create_finetuning_jobs(request: FineTuningParams, background_tasks: BackgroundTasks): + return handle_create_finetuning_jobs(request, background_tasks) + +@register_microservice( + name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"] +) +def list_finetuning_jobs(): + return handle_list_finetuning_jobs() + + +@register_microservice( + name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8015 +) +def retrieve_finetuning_job(request: FineTuningJobIDRequest): + job = handle_retrieve_finetuning_job(request) + return job + + +@register_microservice( + name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8015 +) +def cancel_finetuning_job(request: FineTuningJobIDRequest): + job = handle_cancel_finetuning_job(request) + return job + + +@register_microservice( + name="opea_service@finetuning", + endpoint="/v1/files", + host="0.0.0.0", + port=8015, +) +async def upload_training_files(request: UploadFileRequest = Depends(upload_file)): + uploadFileInfo = await handle_upload_training_files(request) + return uploadFileInfo + + +@register_microservice( + name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8015 +) +def list_checkpoints(request: FineTuningJobIDRequest): + checkpoints = handle_list_finetuning_checkpoints(request) + return checkpoints + +@register_microservice(name="opea_service@finetuning", endpoint="/v1/finetune/extract_adapter", host="0.0.0.0", port=8015) +def extract_sub_adapter(request: ExtractAdapterParams): + return handle_extract_sub_adapter(request) + +@register_microservice(name="opea_service@finetuning", endpoint="/v1/finetune/merge_adapter", host="0.0.0.0", port=8015) +def merge_adapter(request: MergeAdapterParams): + return handle_merge_adapter(request) + + +if __name__ == "__main__": + opea_microservices["opea_service@finetuning"].start() diff --git a/comps/finetuning_sqft/handlers.py b/comps/finetuning_sqft/handlers.py new file mode 100644 index 0000000000..03e5745981 --- /dev/null +++ b/comps/finetuning_sqft/handlers.py @@ -0,0 +1,338 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import os +import random +import re +import time +import urllib.parse +import uuid +from pathlib import Path +from typing import Dict + +from fastapi import BackgroundTasks, File, Form, HTTPException, UploadFile +from pydantic_yaml import parse_yaml_file_as, to_yaml_file +from ray.job_submission import JobSubmissionClient + +from comps import CustomLogger +from comps.cores.proto.api_protocol import ( + FileObject, + FineTuningJob, + FineTuningJobCheckpoint, + FineTuningJobIDRequest, + FineTuningJobList, + UploadFileRequest, +) +from comps.finetuning_sqft.finetune_sqft_config import ( + ExtractAdapterParams, + FinetuneConfig, + FineTuningParams, + 
MergeAdapterParams, +) + +logger = CustomLogger("finetuning_handlers") + +DATASET_BASE_PATH = "datasets" +JOBS_PATH = "jobs" +OUTPUT_DIR = "output" + +if not os.path.exists(DATASET_BASE_PATH): + os.mkdir(DATASET_BASE_PATH) +if not os.path.exists(JOBS_PATH): + os.mkdir(JOBS_PATH) +if not os.path.exists(OUTPUT_DIR): + os.mkdir(OUTPUT_DIR) + +FineTuningJobID = str +CheckpointID = str +CheckpointPath = str + +CHECK_JOB_STATUS_INTERVAL = 5 # Check every 5 secs + +global ray_client +ray_client: JobSubmissionClient = None + +running_finetuning_jobs: Dict[FineTuningJobID, FineTuningJob] = {} +finetuning_job_to_ray_job: Dict[FineTuningJobID, str] = {} +checkpoint_id_to_checkpoint_path: Dict[CheckpointID, CheckpointPath] = {} + + +# Add a background task to periodicly update job status +def update_job_status(job_id: FineTuningJobID): + while True: + job_status = ray_client.get_job_status(finetuning_job_to_ray_job[job_id]) + status = str(job_status).lower() + # Ray status "stopped" is OpenAI status "cancelled" + status = "cancelled" if status == "stopped" else status + logger.info(f"Status of job {job_id} is '{status}'") + running_finetuning_jobs[job_id].status = status + if status == "succeeded" or status == "cancelled" or status == "failed": + break + time.sleep(CHECK_JOB_STATUS_INTERVAL) + + +def handle_create_finetuning_jobs(request: FineTuningParams, background_tasks: BackgroundTasks): + base_model = request.model + train_file = request.training_file + train_file_path = os.path.join(DATASET_BASE_PATH, train_file) + + if not os.path.exists(train_file_path): + raise HTTPException(status_code=404, detail=f"Training file '{train_file}' not found!") + + finetune_config = FinetuneConfig(General=request.General, Dataset=request.Dataset, Training=request.Training) + finetune_config.General.base_model = base_model + finetune_config.Dataset.train_file = train_file_path + if request.hyperparameters is not None: + if request.hyperparameters.epochs != "auto": + finetune_config.Training.epochs = request.hyperparameters.epochs + + if request.hyperparameters.batch_size != "auto": + finetune_config.Training.batch_size = request.hyperparameters.batch_size + + if request.hyperparameters.learning_rate_multiplier != "auto": + finetune_config.Training.learning_rate = request.hyperparameters.learning_rate_multiplier + + if os.getenv("HF_TOKEN", None): + finetune_config.General.config.token = os.getenv("HF_TOKEN", None) + + job = FineTuningJob( + id=f"ft-job-{uuid.uuid4()}", + model=base_model, + created_at=int(time.time()), + training_file=train_file, + hyperparameters={ + "n_epochs": finetune_config.Training.epochs, + "batch_size": finetune_config.Training.batch_size, + "learning_rate_multiplier": finetune_config.Training.learning_rate, + }, + status="running", + seed=random.randint(0, 1000) if request.seed is None else request.seed, + ) + finetune_config.General.output_dir = os.path.join(OUTPUT_DIR, job.id) + if os.getenv("DEVICE", ""): + logger.info(f"specific device: {os.getenv('DEVICE')}") + + finetune_config.Training.device = os.getenv("DEVICE") + if finetune_config.Training.device == "hpu": + if finetune_config.Training.resources_per_worker.HPU == 0: + # set 1 + finetune_config.Training.resources_per_worker.HPU = 1 + + finetune_config_file = f"{JOBS_PATH}/{job.id}.yaml" + to_yaml_file(finetune_config_file, finetune_config) + + global ray_client + ray_client = JobSubmissionClient() if ray_client is None else ray_client + + ray_job_id = ray_client.submit_job( + # Entrypoint shell command to execute + 
entrypoint=f"python finetune_runner.py --config_file {finetune_config_file}", + ) + + logger.info(f"Submitted Ray job: {ray_job_id} ...") + + running_finetuning_jobs[job.id] = job + finetuning_job_to_ray_job[job.id] = ray_job_id + + background_tasks.add_task(update_job_status, job.id) + + return job + + +def handle_extract_sub_adapter(request: ExtractAdapterParams): + fine_tuning_job_id = request.fine_tuning_job_id + finetune_config_file = f"{JOBS_PATH}/{fine_tuning_job_id}.yaml" + finetune_config = parse_yaml_file_as(FinetuneConfig, finetune_config_file) + + job = running_finetuning_jobs.get(fine_tuning_job_id) + if job is None: + raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") + finetuned_model_path = os.path.join(OUTPUT_DIR, fine_tuning_job_id) + assert finetuned_model_path == finetune_config.General.output_dir + if not os.path.exists(finetuned_model_path): + raise HTTPException( + status_code=404, + detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!" + ) + if job.status != "succeeded": + raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' has not completed!") + + if finetune_config.General.lora_config is None: + raise HTTPException( + status_code=404, + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!" + ) + if not finetune_config.General.lora_config.neural_lora_search: + raise HTTPException( + status_code=404, + detail=f"The fine-tuning job '{fine_tuning_job_id}' did not enable NLS algorithm, " + f"there is no need to extract sub-adapters!" + ) + nncf_config_path = os.path.join(finetune_config.General.output_dir, "nncf_config.json") + if not os.path.exists(nncf_config_path): + raise HTTPException( + status_code=404, + detail=f"The NNCF config file does not exist in the fine-tuning job '{fine_tuning_job_id}!" + ) + + from comps.finetuning_sqft.utils.extract_sub_adapter import main as extract_sub_adapter_main + extract_sub_adapter_main( + adapter_model_path=finetuned_model_path, + nncf_config=nncf_config_path, + sub_adapter_version=request.sub_adapter_version, + custom_config=request.custom_config + ) + + return fine_tuning_job_id + + +def handle_merge_adapter(request: MergeAdapterParams): + fine_tuning_job_id = request.fine_tuning_job_id + finetune_config_file = f"{JOBS_PATH}/{fine_tuning_job_id}.yaml" + finetune_config = parse_yaml_file_as(FinetuneConfig, finetune_config_file) + + job = running_finetuning_jobs.get(fine_tuning_job_id) + if job is None: + raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") + finetuned_model_path = os.path.join(OUTPUT_DIR, fine_tuning_job_id) + assert finetuned_model_path == finetune_config.General.output_dir + if not os.path.exists(finetuned_model_path): + raise HTTPException( + status_code=404, + detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!" + ) + if job.status != "succeeded": + raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' has not completed!") + + if finetune_config.General.lora_config is None: + raise HTTPException( + status_code=404, + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!" 
+ ) + + adapter_path = finetuned_model_path + adapter_version = request.adapter_version + if adapter_version is not None: + adapter_path = os.path.join(adapter_path, adapter_version) + if not os.path.exists(adapter_path): + raise HTTPException( + status_code=404, + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not have a '{adapter_version}' adapter!" + ) + + from comps.finetuning_sqft.utils.merge import main as merge_adapter_main + merge_adapter_main( + base_model_path=finetune_config.General.base_model, + adapter_model_path=adapter_path, + output_path=os.path.join(adapter_path, "merged_model") + ) + + return fine_tuning_job_id + + +def handle_list_finetuning_jobs(): + finetuning_jobs_list = FineTuningJobList(data=list(running_finetuning_jobs.values()), has_more=False) + + return finetuning_jobs_list + + +def handle_retrieve_finetuning_job(request: FineTuningJobIDRequest): + fine_tuning_job_id = request.fine_tuning_job_id + + job = running_finetuning_jobs.get(fine_tuning_job_id) + if job is None: + raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") + return job + + +def handle_cancel_finetuning_job(request: FineTuningJobIDRequest): + fine_tuning_job_id = request.fine_tuning_job_id + + ray_job_id = finetuning_job_to_ray_job.get(fine_tuning_job_id) + if ray_job_id is None: + raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") + + global ray_client + ray_client = JobSubmissionClient() if ray_client is None else ray_client + ray_client.stop_job(ray_job_id) + + job = running_finetuning_jobs.get(fine_tuning_job_id) + job.status = "cancelled" + return job + + +async def save_content_to_local_disk(save_path: str, content): + save_path = Path(save_path) + try: + if isinstance(content, str): + with open(save_path, "w", encoding="utf-8") as file: + file.write(content) + else: + with save_path.open("wb") as fout: + content = await content.read() + fout.write(content) + except Exception as e: + logger.info(f"Write file failed. Exception: {e}") + raise Exception(status_code=500, detail=f"Write file {save_path} failed. 
Exception: {e}") + + +def handle_list_finetuning_checkpoints(request: FineTuningJobIDRequest): + fine_tuning_job_id = request.fine_tuning_job_id + + job = running_finetuning_jobs.get(fine_tuning_job_id) + if job is None: + raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") + output_dir = os.path.join(OUTPUT_DIR, job.id) + checkpoints = [] + if os.path.exists(output_dir): + # Iterate over the contents of the directory and add an entry for each + files = os.listdir(output_dir) + for file in files: # Loop over directory contents + file_path = os.path.join(output_dir, file) + if os.path.isdir(file_path) and file.startswith("checkpoint"): + steps = re.findall("\d+", file)[0] + checkpointsResponse = FineTuningJobCheckpoint( + id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID + created_at=int(time.time()), # Use the current timestamp + fine_tuned_model_checkpoint=file_path, # Directory path itself + fine_tuning_job_id=fine_tuning_job_id, + object="fine_tuning.job.checkpoint", + step_number=steps, + ) + checkpoints.append(checkpointsResponse) + if job.status == "succeeded": + checkpointsResponse = FineTuningJobCheckpoint( + id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID + created_at=int(time.time()), # Use the current timestamp + fine_tuned_model_checkpoint=output_dir, # Directory path itself + fine_tuning_job_id=fine_tuning_job_id, + object="fine_tuning.job.checkpoint", + ) + checkpoints.append(checkpointsResponse) + + return checkpoints + + +async def upload_file(purpose: str = Form(...), file: UploadFile = File(...)): + return UploadFileRequest(purpose=purpose, file=file) + + +async def handle_upload_training_files(request: UploadFileRequest): + file = request.file + if file is None: + raise HTTPException(status_code=404, detail="upload file failed!") + filename = urllib.parse.quote(file.filename, safe="") + save_path = os.path.join(DATASET_BASE_PATH, filename) + await save_content_to_local_disk(save_path, file) + + fileBytes = os.path.getsize(save_path) + fileInfo = FileObject( + id=f"file-{uuid.uuid4()}", + object="file", + bytes=fileBytes, + created_at=int(time.time()), + filename=filename, + purpose="fine-tune", + ) + + return fileInfo diff --git a/comps/finetuning_sqft/launch.sh b/comps/finetuning_sqft/launch.sh new file mode 100644 index 0000000000..034c82f3d2 --- /dev/null +++ b/comps/finetuning_sqft/launch.sh @@ -0,0 +1,12 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +if [[ -n "$RAY_PORT" ]];then + ray start --head --port $RAY_PORT --dashboard-host=0.0.0.0 +else + ray start --head --dashboard-host=0.0.0.0 + export RAY_PORT=8265 +fi + +export RAY_ADDRESS=http://localhost:$RAY_PORT +python finetuning_sqft_service.py diff --git a/comps/finetuning_sqft/llm_on_ray/common/__init__.py b/comps/finetuning_sqft/llm_on_ray/common/__init__.py new file mode 100644 index 0000000000..954b7baa4b --- /dev/null +++ b/comps/finetuning_sqft/llm_on_ray/common/__init__.py @@ -0,0 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Copyright 2023 The LLM-on-Ray Authors. 
+ +from .torch_config import TorchConfig diff --git a/comps/finetuning_sqft/llm_on_ray/common/common.py b/comps/finetuning_sqft/llm_on_ray/common/common.py new file mode 100644 index 0000000000..ac01ae12e1 --- /dev/null +++ b/comps/finetuning_sqft/llm_on_ray/common/common.py @@ -0,0 +1,29 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Copyright 2023 The LLM-on-Ray Authors. + +import glob +import importlib +import os + +from comps import CustomLogger + +logger = CustomLogger("llm_on_ray") + + +def import_all_modules(basedir, prefix=None): + all_py_files = glob.glob(basedir + "/*.py") + modules = [os.path.basename(f) for f in all_py_files] + + for module in modules: + if not module.startswith("_"): + module = module.rstrip(".py") + if prefix is None: + module_name = module + else: + module_name = f"{prefix}.{module}" + try: + importlib.import_module(module_name) + except Exception: + logger.warning(f"import {module_name} error", exc_info=True) diff --git a/comps/finetuning_sqft/llm_on_ray/common/torch_config.py b/comps/finetuning_sqft/llm_on_ray/common/torch_config.py new file mode 100644 index 0000000000..9e3f48a7c3 --- /dev/null +++ b/comps/finetuning_sqft/llm_on_ray/common/torch_config.py @@ -0,0 +1,72 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Copyright 2023 The LLM-on-Ray Authors. + +import os +import sys +from dataclasses import dataclass +from typing import Optional + +from ray.train._internal.worker_group import WorkerGroup +from ray.train.torch.config import TorchConfig as RayTorchConfig +from ray.train.torch.config import _TorchBackend + +# The package importlib_metadata is in a different place, depending on the Python version. +if sys.version_info < (3, 8): + import importlib_metadata +else: + import importlib.metadata as importlib_metadata + + +@dataclass +class TorchConfig(RayTorchConfig): + device: Optional[str] = None + + @property + def backend_cls(self): + EnableCCLBackend.device = self.device + return EnableCCLBackend + + +def xpu_libs_import(): + """Try to import IPEX and oneCCL.""" + try: + import intel_extension_for_pytorch + except ImportError: + raise ImportError("Please install intel_extension_for_pytorch") + try: + ccl_version = importlib_metadata.version("oneccl_bind_pt") + if ccl_version >= "1.12": + import oneccl_bindings_for_pytorch + else: + import torch_ccl + except ImportError as ccl_not_exist: + raise ImportError("Please install torch-ccl") from ccl_not_exist + + +def hpu_libs_import(): + """Try to import habana frameworkfs for torch.""" + try: + import habana_frameworks.torch # noqa: F401 + except ImportError as habana_not_exist: + raise ImportError("Please install habana_frameworks") from habana_not_exist + + +def _set_torch_distributed_env_vars(device): + if device is not None: + os.environ["ACCELERATE_TORCH_DEVICE"] = device + + +class EnableCCLBackend(_TorchBackend): + device: Optional[str] = None + + def on_start(self, worker_group: WorkerGroup, backend_config: RayTorchConfig): + libs_import = hpu_libs_import if self.device is not None and self.device.startswith("hpu") else xpu_libs_import + for i in range(len(worker_group)): + worker_group.execute_single_async(i, libs_import) + super().on_start(worker_group, backend_config) + + def on_training_start(self, worker_group: WorkerGroup, backend_config: RayTorchConfig): + super().on_training_start(worker_group, backend_config) + worker_group.execute(_set_torch_distributed_env_vars, self.device) diff --git 
a/comps/finetuning_sqft/llm_on_ray/finetune/__init__.py b/comps/finetuning_sqft/llm_on_ray/finetune/__init__.py new file mode 100644 index 0000000000..0262e494a9 --- /dev/null +++ b/comps/finetuning_sqft/llm_on_ray/finetune/__init__.py @@ -0,0 +1,4 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Copyright 2023 The LLM-on-Ray Authors. diff --git a/comps/finetuning_sqft/llm_on_ray/finetune/data_process.py b/comps/finetuning_sqft/llm_on_ray/finetune/data_process.py new file mode 100644 index 0000000000..07b12d71e1 --- /dev/null +++ b/comps/finetuning_sqft/llm_on_ray/finetune/data_process.py @@ -0,0 +1,352 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Copyright 2023 The LLM-on-Ray Authors. + +import copy +import math +import random +import re +from dataclasses import dataclass +from itertools import chain +from typing import Dict, List, Tuple + +import torch +from torch.utils.data import Dataset +from transformers import BatchEncoding, DataCollatorWithPadding + +IGNORE_INDEX = -100 + + +class InstructionDataProcessor: + # We used the following prompts for fine-tuning the Alpaca model. You can find reference doc form this URL(https://github.com/tatsu-lab/stanford_alpaca/blob/main/README.md#data-release) + def __init__(self, config, tokenizer): + self.tokenizer = tokenizer + self.end = tokenizer.eos_token + self.intro = ( + "Below is an instruction that describes a task. Write a response that appropriately completes the request." + ) + self.instruction = "### Instruction:\n" + self.input = "### Input:\n" + self.response = "### Response:\n" + self.padding_side = config["Dataset"].get("padding_side", "right") + self.truncation_side = config["Dataset"].get("truncation_side", "right") + self.max_length = self.max_seq_length = config["Dataset"].get("max_length", 512) + self.max_source_length = config["Dataset"].get("max_source_length", 384) + self.truncation = config["Dataset"].get("truncation", True) + self.padding = config["Dataset"].get("padding", True) + self.mask_input = config["Dataset"].get("mask_input", True) + self.mask_response = config["Dataset"].get("mask_response", True) + + def make_prompt(self, examples): + prompts = {} + prompts["prompt_sources"] = [] + prompts["prompt_targets"] = [] + for rec in examples: + instruction = rec["instruction"] + response = rec["input"] + context = rec.get("output") + if not instruction: + raise ValueError(f"Expected an instruction in: {rec}") + # if not response: + # raise ValueError(f"Expected a response in: {rec}") + if context: + prompt = ( + self.intro + + self.end + + "\n" + + self.instruction + + instruction + + self.input + + context + + self.end + + "\n" + + self.response + ) + prompts["prompt_sources"].append(prompt) + else: + prompt = self.intro + self.end + "\n" + self.instruction + instruction + self.end + "\n" + self.response + prompts["prompt_sources"].append(prompt) + prompt_response = response + self.end + prompts["prompt_targets"].append(prompt_response) + return prompts + + def __truncate_sequences(self, sequences, max_length): + """ + Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L40 + """ + words_to_cut = sum(list(map(len, sequences))) - max_length + if words_to_cut <= 0: + return sequences + + while words_to_cut > 0 and len(sequences) > 0: + words_to_cut -= len(sequences[0]) + sequences = sequences[1:] + return 
sequences + + def tokenize_by_neural_chat(self, examples): + """ + Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L225 + The only differences are: + - using our own prompt style + - add left or right padding and truncation + - add mask_input and mask_response + """ + keys = list(examples.data.keys()) + if len(keys) != 2: + raise ValueError("Unsupported dataset format") + assistant_tokens = self.tokenizer.tokenize(self.response) + header = self.intro + self.end + "\n" + + examples["input_ids"] = [] + examples["labels"] = [] + examples["attention_mask"] = [] + for instruction, response in zip(examples[keys[0]], examples[keys[1]]): + convs = re.findall( + r"{0}.*?{2}|{1}.*?{2}".format(self.instruction, self.response, self.end), + instruction, + re.DOTALL, + ) + convs_tokens = [self.tokenizer.tokenize(conv) + self.tokenizer.tokenize("\n") for conv in convs] + header_tokens = self.tokenizer.tokenize(header) + self.tokenizer.tokenize("\n") + max_input = self.max_source_length - len(header_tokens) - len(assistant_tokens) + truncated_convs = self.__truncate_sequences(convs_tokens, max_input) + if len(truncated_convs) == 0: + truncated_convs = [convs_tokens[-1][: max_input - 3] + convs_tokens[-1][-3:]] + + prompt_tokens = [header_tokens] + truncated_convs + [assistant_tokens] + prompt_ids = [self.tokenizer.convert_tokens_to_ids(prompt_token) for prompt_token in prompt_tokens] + prompt_ids = list(chain(*prompt_ids)) + + resp_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(response.strip())) + # keep last and eos_id + max_resp = self.max_seq_length - len(prompt_ids) - 1 + + # truncating response + if len(resp_ids) > max_resp: + if self.truncation_side == "right": + resp_ids = resp_ids[: max_resp - 1] + resp_ids[-1:] + else: + resp_ids = resp_ids[-max_resp:] + + # masking + input_ids = prompt_ids + resp_ids + [self.tokenizer.eos_token_id] + if self.mask_input: + labels = [IGNORE_INDEX] * len(prompt_ids) + resp_ids + [self.tokenizer.eos_token_id] + elif self.mask_response: + labels = prompt_ids + [IGNORE_INDEX] * len(resp_ids) + [self.tokenizer.eos_token_id] + else: + labels = input_ids + + # padding + input_len = len(input_ids) + pad_len = self.max_seq_length - input_len + if self.padding_side == "right": + input_ids = input_ids + [self.tokenizer.eos_token_id] * pad_len + labels = labels + [IGNORE_INDEX] * pad_len + attention_mask = [1] * input_len + [0] * pad_len + else: + input_ids = [self.tokenizer.eos_token_id] * pad_len + input_ids + labels = [IGNORE_INDEX] * pad_len + labels + attention_mask = [0] * pad_len + [1] * input_len + + assert len(input_ids) == self.max_seq_length + assert len(prompt_ids) <= self.max_source_length + assert len(labels) == len(input_ids) == len(attention_mask) + + examples["input_ids"].append(torch.tensor(input_ids)) + examples["labels"].append(labels) + examples["attention_mask"].append(attention_mask) + + return examples + + def tokenize(self, examples): + keys = list(examples.data.keys()) + if len(keys) != 2: + raise ValueError("Unsupported dataset format") + + examples["input_ids"] = [] + examples["labels"] = [] + examples["attention_mask"] = [] + for s, t in zip(examples[keys[0]], examples[keys[1]]): + results = self.tokenizer( + s + t, + padding=self.padding, + truncation=self.truncation, + return_tensors=None, + max_length=self.max_length, + ) + + input_ids = results["input_ids"] + input_len = 
len(input_ids) + labels = copy.deepcopy(input_ids) + if self.mask_input or self.mask_response: + sources_tokenized = self.tokenizer( + s, + padding=False, + truncation=True, + return_tensors=None, + max_length=self.max_length, + ) + input_id_len = len(sources_tokenized["input_ids"]) + # mask input + if self.mask_input: + labels[:input_id_len] = [IGNORE_INDEX] * input_id_len + # mask response + if self.mask_response: + labels[input_id_len:input_len] = [IGNORE_INDEX] * (input_len - input_id_len) + + examples["input_ids"].append(results["input_ids"]) + examples["labels"].append(labels) + examples["attention_mask"].append(results["attention_mask"]) + return examples + + +class PretrainingDataProcessor: + def __init__(self, config, tokenizer): + self.tokenizer = tokenizer + self.max_length = self.max_seq_length = config["Dataset"].get("max_length", 512) + self.truncation = config["Dataset"].get("truncation", True) + self.padding = config["Dataset"].get("padding", True) + + def tokenize(self, examples): + keys = list(examples.data.keys()) + if len(keys) != 1 and "text" not in keys: + raise ValueError("Unsupported dataset format") + + key = keys[0] if len(keys) == 1 else "text" + examples["input_ids"] = [] + examples["labels"] = [] + examples["attention_mask"] = [] + for exp in examples[key]: + results = self.tokenizer( + exp, + padding=self.padding, + truncation=self.truncation, + return_tensors=None, + max_length=self.max_length, + ) + + input_ids = results["input_ids"] + labels = copy.deepcopy(input_ids) + examples["input_ids"].append(results["input_ids"]) + examples["labels"].append(labels) + examples["attention_mask"].append(results["attention_mask"]) + return examples + + +class TrainDatasetForCE(Dataset): + def __init__(self, dataset, args, tokenizer): + self.dataset = dataset + self.tokenizer = tokenizer + self.args = args + self.total_len = len(self.dataset) + + def create_one_example(self, qry_encoding: str, doc_encoding: str): + item = self.tokenizer.encode_plus( + qry_encoding, + doc_encoding, + truncation=True, + max_length=self.args.get("max_length", 512), + padding=False, + ) + return item + + def __len__(self): + return self.total_len + + def __getitem__(self, item) -> List[BatchEncoding]: + query = self.dataset[item]["query"] + pos = random.choice(self.dataset[item]["pos"]) + train_group_size = self.args.get("train_group_size", 8) + if len(self.dataset[item]["neg"]) < train_group_size - 1: + num = math.ceil((train_group_size - 1) / len(self.dataset[item]["neg"])) + negs = random.sample(self.dataset[item]["neg"] * num, train_group_size - 1) + else: + negs = random.sample(self.dataset[item]["neg"], train_group_size - 1) + + batch_data = [] + batch_data.append(self.create_one_example(query, pos)) + for neg in negs: + batch_data.append(self.create_one_example(query, neg)) + + return batch_data + + +@dataclass +class GroupCollator(DataCollatorWithPadding): + def __call__(self, features) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: + if isinstance(features[0], list): + features = sum(features, []) + return super().__call__(features) + + +class TrainDatasetForEmbedding(Dataset): + def __init__(self, dataset, args, tokenizer): + self.dataset = dataset + self.tokenizer = tokenizer + self.args = args + self.total_len = len(self.dataset) + + def __len__(self): + return self.total_len + + def __getitem__(self, item) -> Tuple[str, List[str]]: + query = self.dataset[item]["query"] + if self.args["query_instruction_for_retrieval"] is not None: + query = 
self.args["query_instruction_for_retrieval"] + query + + passages = [] + + assert isinstance(self.dataset[item]["pos"], list) + pos = random.choice(self.dataset[item]["pos"]) + passages.append(pos) + + train_group_size = self.args.get("train_group_size", 8) + if len(self.dataset[item]["neg"]) < train_group_size - 1: + num = math.ceil((train_group_size - 1) / len(self.dataset[item]["neg"])) + negs = random.sample(self.dataset[item]["neg"] * num, train_group_size - 1) + else: + negs = random.sample(self.dataset[item]["neg"], train_group_size - 1) + passages.extend(negs) + + if self.args["passage_instruction_for_retrieval"] is not None: + passages = [self.args["passage_instruction_for_retrieval"] + p for p in passages] + return query, passages + + +@dataclass +class EmbedCollator(DataCollatorWithPadding): + """Wrapper that does conversion from List[Tuple[encode_qry, encode_psg]] to List[qry], List[psg] + and pass batch separately to the actual collator. + + Abstract out data detail for the model. + """ + + query_max_len: int = 32 + passage_max_len: int = 128 + + def __call__(self, features): + query = [f[0] for f in features] + passage = [f[1] for f in features] + + if isinstance(query[0], list): + query = sum(query, []) + if isinstance(passage[0], list): + passage = sum(passage, []) + + q_collated = self.tokenizer( + query, + padding=self.padding, + truncation=True, + max_length=self.query_max_len, + return_tensors="pt", + ) + d_collated = self.tokenizer( + passage, + padding=self.padding, + truncation=True, + max_length=self.passage_max_len, + return_tensors="pt", + ) + return {"query": q_collated, "passage": d_collated} diff --git a/comps/finetuning_sqft/llm_on_ray/finetune/finetune.py b/comps/finetuning_sqft/llm_on_ray/finetune/finetune.py new file mode 100644 index 0000000000..82f2e65c1d --- /dev/null +++ b/comps/finetuning_sqft/llm_on_ray/finetune/finetune.py @@ -0,0 +1,602 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Copyright 2023 The LLM-on-Ray Authors. + +#!/usr/bin/env python + +import argparse +import os +import sys +from itertools import chain +from typing import Any, Dict, Optional + +import datasets +import ray +import torch +import transformers +from peft import LoraConfig, get_peft_model +from pydantic_yaml import parse_yaml_raw_as +from ray.air import FailureConfig, RunConfig +from ray.air.config import ScalingConfig +from ray.train.torch import TorchTrainer +from transformers import Trainer, TrainingArguments + +from comps import CustomLogger +from comps.finetuning_sqft.finetune_sqft_config import FinetuneConfig +from comps.finetuning_sqft.llm_on_ray import common +from comps.finetuning_sqft.llm_on_ray.finetune.data_process import ( + EmbedCollator, + GroupCollator, + InstructionDataProcessor, + PretrainingDataProcessor, + TrainDatasetForCE, + TrainDatasetForEmbedding, +) +from comps.finetuning_sqft.llm_on_ray.finetune.modeling import BiEncoderModel, CrossEncoder + +logger = CustomLogger("llm_on_ray/finetune") + +try: + from comps.finetuning_sqft.utils.nncf_config_process import load_nncf_config + from nncf.experimental.torch.nas.bootstrapNAS.training.model_creator_helpers import ( + create_compressed_model_from_algo_names, + ) + from nncf.torch.model_creation import create_nncf_network + is_nncf_available = True +except ImportError: + is_nncf_available = False + logger.info("NNCF is not installed. 
Please install it if necessary.") + + +def adapt_transformers_to_device(config: Dict): + device = config["Training"]["device"] + if device in ["hpu"]: + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + # adapt transformers to gaudi + adapt_transformers_to_gaudi() + + +def set_seed(config: Dict): + seed = config["Training"].get("seed", None) + if seed is None: + return + device = config["Training"]["device"] + if device in ["cpu", "gpu"]: + from accelerate.utils import set_seed as _set_seed + + _set_seed(seed) + elif device in ["hpu"]: + from optimum.habana.utils import set_seed as _set_seed + + _set_seed(seed) + + +def convert_to_training_args(cls, config: Dict): + device = config["Training"]["device"] + accelerate_mode = config["Training"]["accelerate_mode"] + save_strategy = config["General"]["save_strategy"] + + args = { + "output_dir": config["General"]["output_dir"], + "report_to": config["General"]["report_to"], + "resume_from_checkpoint": config["General"]["resume_from_checkpoint"], + "gradient_checkpointing": config["General"]["enable_gradient_checkpointing"], + "save_strategy": save_strategy if save_strategy != "False" else "no", + "bf16": config["Training"]["mixed_precision"] == "bf16", + "num_train_epochs": config["Training"]["epochs"], + "per_device_train_batch_size": config["Training"]["batch_size"], + "per_device_eval_batch_size": config["Training"]["batch_size"], + "optim": config["Training"]["optimizer"], + "learning_rate": config["Training"]["learning_rate"], + "logging_steps": config["Training"]["logging_steps"], + "lr_scheduler_type": config["Training"]["lr_scheduler"], + "weight_decay": config["Training"]["weight_decay"], + "gradient_accumulation_steps": config["Training"]["gradient_accumulation_steps"], + "do_train": True, + } + + # set attr do_eval + vf = config["Dataset"].get("validation_file", None) + vsp = config["Dataset"].get("validation_split_percentage", 0) + if vf is not None or (vsp / 100 > 0.0 and vsp / 100 < 1.0): + args.update({"do_eval": True}) + + # set attr max_steps + if config["Training"]["max_train_steps"] is not None: + args.update({"max_steps": config["Training"]["max_train_steps"]}) + + # set attr for device cpu + if device == "cpu": + if hasattr(cls, "use_cpu"): + args.update({"use_cpu": True}) + if hasattr(cls, "no_cuda"): + args.update({"no_cuda": True}) + # To be tested: whether it works when enabling Neural Lora Search (using NNCF) + args.update({"use_ipex": True}) + + # set attr 'deepspeed' + if accelerate_mode == "DEEPSPEED": + args.update({"deepspeed": config["Training"]["deepspeed_config_file"]}) + + # set attr for FSDP + # if accelerate_mode == "FSDP": + # args.updatwe({}) + + # set attr for Intel Gaudi + if device == "hpu": + args.update({"use_habana": True}) + args.update({"use_lazy_mode": config["Training"]["hpu_execution_mode"] == "lazy"}) + args.update({"pipelining_fwd_bwd": True}) + + return cls(**args) + + +def convert_dtype(dtype: str) -> Optional[torch.dtype]: + supported_dtypes = { + "fp16": torch.float16, + "bf16": torch.bfloat16, + "no": None, + } + return supported_dtypes[dtype] + + +def load_tokenizer(config: Dict): + if config["General"].get("tokenizer_name") is not None: + tokenizer_name = config["General"].get("tokenizer_name") + else: + tokenizer_name = config["General"]["base_model"] + load_config = config["General"].get("config", {}) + # default padding side is right + padding_side = config["Dataset"].get("padding_side", "right") + # default truncation side is right + 
truncation_side = config["Dataset"].get("truncation_side", "right") + tokenizer = transformers.AutoTokenizer.from_pretrained( + tokenizer_name, padding_side=padding_side, truncation_side=truncation_side, **load_config + ) + return tokenizer + + +def load_dataset(config: Dict): + dataset_file = config["Dataset"].get("train_file", None) + if dataset_file is None: + return + + if os.path.exists(dataset_file): + # load from local file + def local_load(name, **load_config): + if os.path.isfile(name): + file = os.path.basename(os.path.abspath(name)) + path = os.path.dirname(os.path.abspath(name)) + dataset = datasets.load_dataset(path, data_files=file, **load_config) + else: + dataset = datasets.load_dataset(name, **load_config) + return dataset["train"] + + train_dataset = local_load(dataset_file) + validation_file = config["Dataset"].get("validation_file", None) + if validation_file is not None: + validation_dataset = local_load(validation_file) + return datasets.DatasetDict({"train": train_dataset, "validation": validation_dataset}) + + validation_split_percentage = config["Dataset"].get("validation_split_percentage", 0) + if validation_split_percentage / 100 > 0.0 and validation_split_percentage / 100 < 1.0: + dataset_dict = train_dataset.train_test_split(test_size=validation_split_percentage / 100) + dataset_dict["validation"] = dataset_dict["test"] + return dataset_dict + + return datasets.DatasetDict({"train": train_dataset}) + else: + # try to download and load dataset from huggingface.co + load_config = config["General"].get("config", {}) + use_auth_token = load_config.get("token", None) + raw_dataset = datasets.load_dataset(dataset_file, token=use_auth_token) + + validation_split_percentage = config["Dataset"].get("validation_split_percentage", 0) + if "validation" not in raw_dataset.keys() and ( + validation_split_percentage / 100 > 0.0 and validation_split_percentage / 100 < 1.0 + ): + dataset_dict = raw_dataset["train"].train_test_split(test_size=validation_split_percentage / 100) + dataset_dict["validation"] = dataset_dict["test"] + return dataset_dict + + return raw_dataset + + +def tokenize_dataset(config: Dict, tokenizer, dataset): + task = config["General"].get("task", "instruction_tuning") + if task == "instruction_tuning": + group = config["Dataset"].get("group", True) + block_size = config["Dataset"].get("block_size", 512) + tokenizer.pad_token = tokenizer.eos_token if not tokenizer.pad_token else tokenizer.pad_token + + processor = InstructionDataProcessor(config, tokenizer) + + for key in dataset: + prompts = processor.make_prompt(dataset[key]) + dataset[key] = datasets.Dataset.from_dict(prompts) + + column_names = list(dataset["train"].features) + tokenize_fn = ( + processor.tokenize_by_neural_chat + if config["Dataset"].get("data_preprocess_type", "") == "neural_chat" + else processor.tokenize + ) + + tokenized_dataset = dataset.map( + tokenize_fn, + remove_columns=column_names, + batched=True, + load_from_cache_file=False, + desc="Tokenize dataset", + ) + + if group: + + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
+ result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + return result + + tokenized_dataset = tokenized_dataset.map( + group_texts, + batched=True, + load_from_cache_file=False, + desc=f"Grouping texts in chunks of {block_size}", + ) + + return tokenized_dataset + elif task == "pretraining": + group = True + block_size = config["Dataset"].get("block_size", 512) + tokenizer.pad_token = tokenizer.eos_token if not tokenizer.pad_token else tokenizer.pad_token + + processor = PretrainingDataProcessor(config, tokenizer) + + column_names = list(dataset["train"].features) + + tokenized_dataset = dataset.map( + processor.tokenize, + remove_columns=column_names, + batched=True, + load_from_cache_file=False, + desc="Tokenize dataset", + ) + + if group: + + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. + result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + return result + + tokenized_dataset = tokenized_dataset.map( + group_texts, + batched=True, + load_from_cache_file=False, + desc=f"Grouping texts in chunks of {block_size}", + ) + + return tokenized_dataset + elif task == "rerank": + dataset["train"] = TrainDatasetForCE(dataset["train"], config["Dataset"], tokenizer) + return dataset + elif task == "embedding": + dataset["train"] = TrainDatasetForEmbedding(dataset["train"], config["Dataset"], tokenizer) + return dataset + else: + raise NotImplementedError(f"Unsupported task {task}, only support instruction_tuning, rerank, embedding now.") + + +def prepare_data_collator(config: Dict, tokenizer): + task = config["General"].get("task", "instruction_tuning") + if task == "instruction_tuning" or task == "pretraining": + return transformers.DataCollatorForLanguageModeling( + tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8 + ) + elif task == "rerank": + return GroupCollator(tokenizer) + elif task == "embedding": + return EmbedCollator( + tokenizer=tokenizer, + padding=config["Dataset"]["padding"], + query_max_len=config["Dataset"]["query_max_len"], + passage_max_len=config["Dataset"]["passage_max_len"], + ) + else: + raise NotImplementedError(f"Unsupported task {task}, only support instruction_tuning, rerank, embedding now.") + + +def load_model(config: Dict): + model_name = config["General"]["base_model"] + model_dtype = convert_dtype(config["Training"].get("mixed_precision", "no")) + model_config = config["General"].get("config", {}) + task = config["General"].get("task", "instruction_tuning") + compression_ctrl = None + if task == "instruction_tuning" or task == "pretraining": + model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, **model_config) + lora_config = config["General"].get("lora_config", None) + if lora_config and task != "pretraining": + neural_lora_search = lora_config.pop("neural_lora_search", False) + target_module_groups = lora_config.pop("target_module_groups", None) + search_space = lora_config.pop("search_space", None) + nncf_config = 
lora_config.pop("nncf_config", None) + if not lora_config.get("sparse_adapter", False): + # To avoid the error in the following case: + # not using SparsePEFT and not having the peft library that supports SparsePEFT installed. + lora_config.pop("sparse_adapter", False) + peft_config = LoraConfig(**lora_config) + model = get_peft_model(model, peft_config) + + # Neural LoRA Search (NLS) + if neural_lora_search: + if not is_nncf_available: + raise ImportError("NNCF is not installed. Please install it.") + nncf_config = load_nncf_config( + config=config, + model=model, + target_module_groups=target_module_groups, + search_space=search_space, + nncf_config=nncf_config + ) + model = create_nncf_network(model, nncf_config) + compression_ctrl, model = create_compressed_model_from_algo_names( + model, nncf_config, algo_names=["progressive_shrinking"] + ) + elif task == "rerank": + model = CrossEncoder.from_pretrained( + config["Dataset"].get("train_group_size", 8), + config["Training"]["batch_size"], + model_name, + from_tf=bool(".ckpt" in model_name), + config=model_config, + ) + elif task == "embedding": + should_concat = False + if ( + config["Dataset"]["query_max_len"] == config["Dataset"]["passage_max_len"] + and config["Dataset"]["padding"] == "max_length" + ): + should_concat = True + if config["Training"]["device"] == "hpu" and not should_concat: + raise ValueError("please set query_max_len==passage_max_len and padding='max_length' for hpu.") + + if config["Training"].get("embedding_training_config", None) is not None: + model = BiEncoderModel( + model_name=model_name, should_concat=should_concat, **config["Training"]["embedding_training_config"] + ) + else: + model = BiEncoderModel(model_name=model_name, should_concat=should_concat) + else: + raise NotImplementedError(f"Unsupported task {task}, only support instruction_tuning, rerank, embedding now.") + + egc = config["General"].get("enable_gradient_checkpointing", False) + if egc: + model.enable_input_require_grads() + model.gradient_checkpointing_enable() + model.config.use_cache = False + + model.to(dtype=model_dtype, device=torch.device(config["Training"]["device"])) + + return model, compression_ctrl + +def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator, compression_ctrl=None): + device = config["Training"]["device"] + if device in ["cpu", "gpu", "cuda"]: + training_args = convert_to_training_args(TrainingArguments, config) + trainer_args = { + "model": model, + "args": training_args, + "train_dataset": tokenized_dataset["train"], + "eval_dataset": tokenized_dataset["validation"] if tokenized_dataset.get("validation") is not None else None, + "tokenizer": tokenizer, + "data_collator": data_collator, + } + if compression_ctrl is not None: + trainer_args["compression_ctrl"] = compression_ctrl + + trainer = Trainer(**trainer_args) + return training_args, trainer + elif device in ["hpu"]: + assert compression_ctrl is None + from optimum.habana import GaudiConfig + from optimum.habana.transformers import GaudiTrainer, GaudiTrainingArguments + + # If gaudi_config_name is provided, load gaudi_config from huggingface model hub(https://huggingface.co/Habana), otherwise use default gaudi_config + gaudi_config_name = config["General"].get("gaudi_config_name", None) + if gaudi_config_name is not None: + gaudi_config = GaudiConfig.from_pretrained(gaudi_config_name) + else: + gaudi_config = GaudiConfig() + gaudi_config.use_fused_adam = True + gaudi_config.use_fused_clip_norm = True + + training_args = 
convert_to_training_args(GaudiTrainingArguments, config) + trainer = GaudiTrainer( + model=model, + args=training_args, + gaudi_config=gaudi_config, + train_dataset=tokenized_dataset["train"], + eval_dataset=tokenized_dataset["validation"] if tokenized_dataset.get("validation") is not None else None, + tokenizer=tokenizer, + data_collator=data_collator, + ) + return training_args, trainer + return None + + +def train_func(config: Dict[str, Any]): + os.chdir(config["cwd"]) + + adapt_transformers_to_device(config) + + set_seed(config) + + tokenizer = load_tokenizer(config) + + dataset = load_dataset(config) + + max_train_samples = config["Dataset"].get("max_train_samples", 0) + if 0 < max_train_samples < len(dataset["train"]): + dataset["train"] = dataset["train"].select(range(max_train_samples)) + + max_eval_samples = config["Dataset"].get("max_eval_samples", 0) + if "validation" in dataset and 0 < max_eval_samples < len(dataset["validation"]): + dataset["validation"] = dataset["validation"].select(range(max_eval_samples)) + + tokenized_dataset = tokenize_dataset(config, tokenizer, dataset) + + data_collator = prepare_data_collator(config, tokenizer) + + model, compression_ctrl = load_model(config) + + training_args, trainer = get_trainer(config, model, tokenizer, tokenized_dataset, data_collator, compression_ctrl=compression_ctrl) + + logger.info("train start") + trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) + trainer.save_model() + logger.info("train finish") + + +def get_finetune_config(): + parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task") + parser.add_argument( + "--config_file", + type=str, + required=True, + default=None, + help="The name of the dataset to use (via the datasets library).", + ) + + # Print help if no arguments were provided + if len(sys.argv) == 1: + parser.print_help(sys.stderr) + sys.exit(1) + + args = parser.parse_args() + config_file = args.config_file + + with open(config_file) as f: + finetune_config = parse_yaml_raw_as(FinetuneConfig, f) + return finetune_config.dict() + + +def main(external_config=None): + if not external_config: + config = get_finetune_config() + else: + config = external_config + + config["cwd"] = os.getcwd() + + num_training_workers = config["Training"].get("num_training_workers") + resources_per_worker = config["Training"].get("resources_per_worker") + + if num_training_workers > 1 and config["Training"].get("accelerate_mode", None) is None: + config["Training"]["accelerate_mode"] = "DDP" # will use DDP to accelerate if no method specified + + ccl_worker_count = 1 + device = config["Training"]["device"] + if device != "cpu": + ccl_worker_count = num_training_workers + + if not ray.is_initialized(): + runtime_env = { + "env_vars": { + "OMP_NUM_THREADS": str(resources_per_worker["CPU"]), + "CCL_ZE_IPC_EXCHANGE": "sockets", + "CCL_WORKER_COUNT": str(ccl_worker_count), + "CCL_LOG_LEVEL": "info", + "FI_TCP_IFACE": "lo", + "FI_PROVIDER": "tcp", + } + } + + if config["General"]["gpt_base_model"] is True: + runtime_env["pip"] = ["transformers==4.26.0"] + + if device == "gpu": + num_cpus = resources_per_worker["CPU"] * num_training_workers + 1 # additional 1 for head worker + ray.init(num_cpus=num_cpus, runtime_env=runtime_env) + else: + ray.init(runtime_env=runtime_env) + + logger.info(f"ray available resources = {ray.available_resources()}") + + use_gpu = True if device == "gpu" else False + scaling_config = ScalingConfig( + 
num_workers=num_training_workers, + use_gpu=use_gpu, + resources_per_worker=resources_per_worker, + placement_strategy="SPREAD", + ) + + # if try to use Intel GPU, convert device to 'xpu' + # due to accelerate internal use 'xpu' represent Intel GPU + if device == "gpu": + from accelerate.utils import is_xpu_available + + if is_xpu_available(): + device = "xpu" + + # Jinjie: commented out the code from line 572 to 581 to temporarily disable CCL for debugging purposes. + # if config.get("torch_config", None) is None: + # backend = None + # if device == "cpu" or device == "xpu" or device == "gpu": + # backend = "ccl" + # elif device == "hpu": + # backend = "hccl" + # torch_config = common.TorchConfig(backend=backend, device=device) + # else: + # customer_torch_config = config.get("torch_config") + # torch_config = common.TorchConfig(**customer_torch_config, device=device) + + if config.get("failure_config", None) is None: + failure_config = FailureConfig() + else: + customer_failure_config = config.get("failure_config") + failure_config = FailureConfig(**customer_failure_config) + + if config.get("run_config", None) is None: + run_config = RunConfig(failure_config=failure_config) + else: + customer_run_config = config.get("run_config") + if customer_run_config.get("failure_config", None) is None: + customer_run_config["failure_config"] = failure_config + run_config = RunConfig(**customer_run_config) + + trainer = TorchTrainer( + train_func, + train_loop_config=config, + scaling_config=scaling_config, + # torch_config=torch_config, # Jinjie: check line 571. + run_config=run_config, + ) + results = trainer.fit() + if external_config is not None: + return results + + +if __name__ == "__main__": + main() diff --git a/comps/finetuning_sqft/llm_on_ray/finetune/modeling.py b/comps/finetuning_sqft/llm_on_ray/finetune/modeling.py new file mode 100644 index 0000000000..7a2884f3bc --- /dev/null +++ b/comps/finetuning_sqft/llm_on_ray/finetune/modeling.py @@ -0,0 +1,211 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, Optional + +import torch +import torch.distributed as dist +from torch import nn +from transformers import AutoModel, AutoModelForSequenceClassification, PreTrainedModel +from transformers.modeling_outputs import MaskedLMOutput, SequenceClassifierOutput + +from comps import CustomLogger + +logger = CustomLogger("llm_on_ray/finetune/modeling") + + +class CrossEncoder(PreTrainedModel): + def __init__(self, hf_model: PreTrainedModel, train_group_size: int, batch_size: int): + super().__init__(hf_model.config) + self.hf_model = hf_model + self.train_group_size = train_group_size + self.batch_size = batch_size + + self.cross_entropy = nn.CrossEntropyLoss(reduction="mean") + + self.register_buffer("target_label", torch.zeros(self.batch_size, dtype=torch.long)) + + def gradient_checkpointing_enable(self, **kwargs): + self.hf_model.gradient_checkpointing_enable(**kwargs) + + def forward(self, **batch): + ranker_out: SequenceClassifierOutput = self.hf_model(**batch, return_dict=True) + logits = ranker_out.logits + + if self.training: + scores = logits.view(-1, self.train_group_size) + loss = self.cross_entropy(scores, self.target_label[: scores.shape[0]]) + + return SequenceClassifierOutput( + loss=loss, + **ranker_out, + ) + else: + return ranker_out + + @classmethod + def from_pretrained(cls, train_group_size: int, batch_size: int, *args, **kwargs): + hf_model = AutoModelForSequenceClassification.from_pretrained(*args, **kwargs) + 
reranker = cls(hf_model, train_group_size, batch_size) + return reranker + + def save_pretrained(self, output_dir: str, **kwargs): + state_dict = self.hf_model.state_dict() + state_dict = type(state_dict)({k: v.clone().cpu() for k, v in state_dict.items()}) + kwargs.pop("state_dict") + self.hf_model.save_pretrained(output_dir, state_dict=state_dict, **kwargs) + + +class BiEncoderModel(nn.Module): + TRANSFORMER_CLS = AutoModel + + def __init__( + self, + model_name: str = None, + should_concat: bool = False, + normalized: bool = False, + sentence_pooling_method: str = "cls", + negatives_cross_device: bool = False, + temperature: float = 1.0, + use_inbatch_neg: bool = True, + ): + super().__init__() + self.model = AutoModel.from_pretrained(model_name, add_pooling_layer=False) + self.cross_entropy = nn.CrossEntropyLoss(reduction="mean") + + self.should_concat = should_concat + self.normalized = normalized + self.sentence_pooling_method = sentence_pooling_method + self.temperature = temperature + self.use_inbatch_neg = use_inbatch_neg + self.config = self.model.config + + if not normalized: + self.temperature = 1.0 + logger.info("reset temperature = 1.0 due to using inner product to compute similarity") + if normalized: + if self.temperature > 0.5: + raise ValueError( + "Temperature should be smaller than 1.0 when use cosine similarity (i.e., normalized=True). Recommend to set it 0.01-0.1" + ) + + self.negatives_cross_device = negatives_cross_device + if self.negatives_cross_device: + if not dist.is_initialized(): + raise ValueError("Distributed training has not been initialized for representation all gather.") + # logger.info("Run in a single GPU, set negatives_cross_device=False") + # self.negatives_cross_device = False + # else: + self.process_rank = dist.get_rank() + self.world_size = dist.get_world_size() + + def gradient_checkpointing_enable(self, **kwargs): + self.model.gradient_checkpointing_enable(**kwargs) + + def sentence_embedding(self, hidden_state, mask): + if self.sentence_pooling_method == "mean": + s = torch.sum(hidden_state * mask.unsqueeze(-1).float(), dim=1) + d = mask.sum(axis=1, keepdim=True).float() + return s / d + elif self.sentence_pooling_method == "cls": + return hidden_state[:, 0] + + def encode(self, features): + if features is None: + return None + psg_out = self.model(**features, return_dict=True) + p_reps = self.sentence_embedding(psg_out.last_hidden_state, features["attention_mask"]) + if self.normalized: + p_reps = torch.nn.functional.normalize(p_reps, dim=-1) + return p_reps.contiguous() + + def encode_concat(self, query, passage): + if query is None or passage is None: + return None + + batch_size = query["input_ids"].size()[0] + + psg_out = self.model( + input_ids=torch.cat([query["input_ids"], passage["input_ids"]]), + attention_mask=torch.cat([query["attention_mask"], passage["attention_mask"]]), + return_dict=True, + ) + reps = self.sentence_embedding( + psg_out.last_hidden_state, torch.cat([query["attention_mask"], passage["attention_mask"]]) + ) + if self.normalized: + reps = torch.nn.functional.normalize(reps, dim=-1) + + q_reps = reps[:batch_size] + p_reps = reps[batch_size:] + + return q_reps.contiguous(), p_reps.contiguous() + + def compute_similarity(self, q_reps, p_reps): + if len(p_reps.size()) == 2: + return torch.matmul(q_reps, p_reps.transpose(0, 1)) + return torch.matmul(q_reps, p_reps.transpose(-2, -1)) + + def forward(self, query: Dict[str, torch.Tensor] = None, passage: Dict[str, torch.Tensor] = None): + if self.should_concat: + q_reps, 
p_reps = self.encode_concat(query, passage) + else: + q_reps = self.encode(query) + p_reps = self.encode(passage) + + if self.training: + if self.negatives_cross_device and self.use_inbatch_neg: + q_reps = self._dist_gather_tensor(q_reps) + p_reps = self._dist_gather_tensor(p_reps) + + group_size = p_reps.size(0) // q_reps.size(0) + if self.use_inbatch_neg: + scores = self.compute_similarity(q_reps, p_reps) / self.temperature # B B*G + scores = scores.view(q_reps.size(0), -1) + + target = torch.arange(scores.size(0), device=scores.device, dtype=torch.long) + target = target * group_size + loss = self.compute_loss(scores, target) + else: + scores = ( + self.compute_similarity( + q_reps[ + :, + None, + :, + ], + p_reps.view(q_reps.size(0), group_size, -1), + ).squeeze(1) + / self.temperature + ) # B G + + scores = scores.view(q_reps.size(0), -1) + target = torch.zeros(scores.size(0), device=scores.device, dtype=torch.long) + loss = self.compute_loss(scores, target) + + else: + scores = self.compute_similarity(q_reps, p_reps) + loss = None + + return MaskedLMOutput(loss=loss, logits=None, hidden_states=None, attentions=None) + + def compute_loss(self, scores, target): + return self.cross_entropy(scores, target) + + def _dist_gather_tensor(self, t: Optional[torch.Tensor]): + if t is None: + return None + t = t.contiguous() + + all_tensors = [torch.empty_like(t) for _ in range(self.world_size)] + dist.all_gather(all_tensors, t) + + all_tensors[self.process_rank] = t + all_tensors = torch.cat(all_tensors, dim=0) + + return all_tensors + + def save(self, output_dir: str): + state_dict = self.model.state_dict() + state_dict = type(state_dict)({k: v.clone().cpu() for k, v in state_dict.items()}) + self.model.save_pretrained(output_dir, state_dict=state_dict) diff --git a/comps/finetuning_sqft/patches/nncf-v2.12.0.patch b/comps/finetuning_sqft/patches/nncf-v2.12.0.patch new file mode 100644 index 0000000000..f4cbfe0401 --- /dev/null +++ b/comps/finetuning_sqft/patches/nncf-v2.12.0.patch @@ -0,0 +1,72 @@ +diff --git a/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py b/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py +index bc6464b24..ca2666626 100644 +--- a/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py ++++ b/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py +@@ -152,3 +152,16 @@ class ElasticityBuilder(PTCompressionAlgorithmBuilder): + + # No conflict resolving with the related config options, parameters are overridden by compression state + self._available_elasticity_dims = list(map(ElasticityDim, available_elasticity_dims_state)) ++ ++ def _are_frozen_layers_allowed(self): ++ """ ++ Check if frozen layers are allowed based on NNCF configuration. ++ If specified in NNCF configuration, frozen layers will be allowed. ++ ++ :return: A tuple where the first element is a boolean indicating if frozen layers are allowed, ++ and the second element is a string message explaining the reason. 
++ """ ++ frozen_layers_allowed = self.config.get("bootstrapNAS", {}).get("training", {}).get("frozen_layers_allowed", False) ++ if frozen_layers_allowed: ++ return True, "Frozen layers are allowed (`frozen_layers_allowed` is set to True in NNCF config)" ++ return super()._are_frozen_layers_allowed() +diff --git a/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py b/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py +index 92609327f..7a0555e3e 100644 +--- a/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py ++++ b/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py +@@ -152,3 +152,16 @@ class ProgressiveShrinkingBuilder(PTCompressionAlgorithmBuilder): + self._bn_adapt_params = state_without_name[self._state_names.BN_ADAPTATION_PARAMS] + bn_adapt_algo_kwargs = get_bn_adapt_algo_kwargs(self.config, self._bn_adapt_params) + self._bn_adaptation = BatchnormAdaptationAlgorithm(**bn_adapt_algo_kwargs) if bn_adapt_algo_kwargs else None ++ ++ def _are_frozen_layers_allowed(self): ++ """ ++ Check if frozen layers are allowed based on the algorithm configuration. ++ If specified in the algorithm configuration, frozen layers will be allowed. ++ ++ :return: A tuple where the first element is a boolean indicating if frozen layers are allowed, ++ and the second element is a string message explaining the reason. ++ """ ++ frozen_layers_allowed = self._algo_config.get("frozen_layers_allowed", False) ++ if frozen_layers_allowed: ++ return True, "Frozen layers are allowed (`frozen_layers_allowed` is set to True in the algorithm config)" ++ return super()._are_frozen_layers_allowed() +diff --git a/nncf/torch/layer_utils.py b/nncf/torch/layer_utils.py +index fb7d7bed7..3b8fda98e 100644 +--- a/nncf/torch/layer_utils.py ++++ b/nncf/torch/layer_utils.py +@@ -127,6 +127,25 @@ class _NNCFModuleMixin: + results = op_results + return results + ++ def get_proxy_module(self, *args): ++ """ ++ Gets a proxy module with pre-operations applied. ++ ++ Args: ++ *args: Arguments for the pre-operations. ++ ++ Returns: ++ ProxyModule: The proxy module with pre-operations applied. ++ """ ++ proxy_module = ProxyModule(self) ++ for op in self.pre_ops.values(): ++ op_args = op(proxy_module, args) ++ if op_args is not None: ++ if not isinstance(op_args, tuple): ++ op_args = tuple([op_args]) ++ args = op_args ++ return proxy_module ++ + + class CompressionParameter(nn.Parameter): + """ diff --git a/comps/finetuning_sqft/patches/peft-v0.10.0.patch b/comps/finetuning_sqft/patches/peft-v0.10.0.patch new file mode 100644 index 0000000000..caefc3e735 --- /dev/null +++ b/comps/finetuning_sqft/patches/peft-v0.10.0.patch @@ -0,0 +1,220 @@ +diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py +index cc5c60a..fa1422e 100644 +--- a/src/peft/tuners/lora/config.py ++++ b/src/peft/tuners/lora/config.py +@@ -268,6 +268,31 @@ class LoraConfig(PeftConfig): + ) + }, + ) ++ sparse_adapter: bool = field( ++ default=False, ++ metadata={ ++ "help": ( ++ "Enable 'SparsePEFT'. This strategy is designed for fine-tuning sparse models using adapters. " ++ "It sparsifies the adapter's parameter matrix (BA) such that the sparsity pattern of BA aligns " ++ "with that of the base model's weights (W). This alignment allows for the merging of the adapter " ++ "with the base model without disrupting its sparsity. It is derived from SQFT() and is used in the " ++ "pipelines SQFT + SparsePEFT and SQFT + QA-SparsePEFT." 
++ ) ++ } ++ ) ++ quantization_aware: bool = field( ++ default=False, ++ metadata={ ++ "help": ( ++ "Enable quantization-aware training. This strategy is designed for fine-tuning GPTQ quantized models " ++ "using adapters. It activates the `SQFTQuantAwareLinear` from SQFT in place of `QuantLinear`, enabling " ++ "quantization-aware training for adapters. This helps optimize model accuracy and allows the adapter " ++ "to be merged with the base quantized model, improving performance and deployment efficiency during " ++ "inference. This strategy, when used in conjunction with `sparse_adapter`, corresponds to the " ++ "SQFT + QA-SparsePEFT method described in the SQFT paper." ++ ) ++ } ++ ) + + def __post_init__(self): + self.peft_type = PeftType.LORA +diff --git a/src/peft/tuners/lora/gptq.py b/src/peft/tuners/lora/gptq.py +index 333dfa6..7272824 100644 +--- a/src/peft/tuners/lora/gptq.py ++++ b/src/peft/tuners/lora/gptq.py +@@ -108,7 +108,17 @@ def dispatch_gptq( + AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) + + if AutoGPTQQuantLinear is not None and isinstance(target_base_layer, AutoGPTQQuantLinear): +- new_module = QuantLinear(target, adapter_name, **kwargs) ++ quantization_aware = kwargs.get("quantization_aware", False) ++ if quantization_aware: ++ # Attempt to import the `SQFTQuantAwareLinear` module ++ # from https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/blob/main/SQFT/modules/sqft_linear.py ++ try: ++ from modules.sqft_linear import SQFTQuantAwareLinear ++ except ImportError: ++ raise ImportError("The module 'SQFTQuantAwareLinear' could not be imported.") ++ new_module = SQFTQuantAwareLinear(target, adapter_name, **kwargs) ++ else: ++ new_module = QuantLinear(target, adapter_name, **kwargs) + target.qweight = target_base_layer.qweight + + return new_module +diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py +index 829b7bd..9d83967 100644 +--- a/src/peft/tuners/lora/layer.py ++++ b/src/peft/tuners/lora/layer.py +@@ -28,6 +28,10 @@ from peft.utils.other import transpose + + from .config import LoraConfig + ++try: ++ from nncf.torch.layers import NNCFLinear ++except ImportError: ++ NNCFLinear = None + + class LoraLayer(BaseTunerLayer): + # All names of layers that may contain (trainable) adapter weights +@@ -346,6 +350,7 @@ class Linear(nn.Module, LoraLayer): + init_lora_weights: Union[bool, str] = True, + use_rslora: bool = False, + use_dora: bool = False, ++ sparse_adapter: bool = False, # Set this to True if enabling 'SparsePEFT' for fine-tuning sparse models + **kwargs, + ) -> None: + super().__init__() +@@ -363,6 +368,7 @@ class Linear(nn.Module, LoraLayer): + use_dora=use_dora, + ) + self.is_target_conv_1d_layer = is_target_conv_1d_layer ++ self.sparse_adapter = sparse_adapter + + def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + """ +@@ -471,6 +477,10 @@ class Linear(nn.Module, LoraLayer): + weight_B = weight_B.float() + + output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter] ++ if self.sparse_adapter: ++ # Apply the sparse mask to BA (`output_tensor`). 
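++            # (The mask keeps only the positions where the base layer's weight is non-zero, so merging this
++            # delta into the sparse base model preserves its sparsity pattern.)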
++ mask = (self.base_layer.weight != 0) ++ output_tensor = output_tensor * mask + + if cast_to_fp32: + output_tensor = output_tensor.to(dtype=dtype) +@@ -506,7 +516,26 @@ class Linear(nn.Module, LoraLayer): + x = x.to(lora_A.weight.dtype) + + if not self.use_dora[active_adapter]: +- result = result + lora_B(lora_A(dropout(x))) * scaling ++ if not self.sparse_adapter: ++ result = result + lora_B(lora_A(dropout(x))) * scaling ++ else: ++ # Since 'sparse_adapter' is enabled, we need to multiply the parameter matrices of `lora_B` and ++ # `lora_A` here instead of calling the forward methods of `lora_B` and `lora_A`. This results ++ # in the NNCF graph not recognizing lora A and lora B nodes when using NLS strategy. Therefore, ++ # we execute `lora_B(lora_A(x))` solely to include these two NNCFLinear nodes in the NNCF graph. ++ if NNCFLinear is not None and not self.training: ++ lora_B(lora_A(x)) ++ if NNCFLinear is not None and isinstance(lora_A, NNCFLinear): ++ adapter_weight = torch.matmul( ++ lora_B.get_proxy_module(x).weight, ++ lora_A.get_proxy_module(x).weight ++ ) * scaling ++ else: ++ adapter_weight = torch.matmul(lora_B.weight, lora_A.weight) * scaling ++ # Apply the sparse mask to BA (`adapter_weight`). ++ mask = (self.base_layer.weight != 0).detach() ++ adapter_weight = adapter_weight * mask ++ result = result + nn.functional.linear(dropout(x), adapter_weight) + else: + x = dropout(x) + result = result + self._apply_dora(x, lora_A, lora_B, scaling, active_adapter) +diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py +index 3f381ef..3e696ca 100644 +--- a/src/peft/tuners/lora/model.py ++++ b/src/peft/tuners/lora/model.py +@@ -193,6 +193,8 @@ class LoraModel(BaseTuner): + "init_lora_weights": lora_config.init_lora_weights, + "use_rslora": lora_config.use_rslora, + "use_dora": lora_config.use_dora, ++ "quantization_aware": lora_config.quantization_aware, ++ "sparse_adapter": lora_config.sparse_adapter, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + } +@@ -233,7 +235,10 @@ class LoraModel(BaseTuner): + child = child.base_layer + + if not hasattr(new_module, "base_layer"): +- new_module.weight = child.weight ++ if hasattr(child, "qweight"): ++ new_module.qweight = child.qweight ++ else: ++ new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + +@@ -401,7 +406,11 @@ class LoraModel(BaseTuner): + Currently gptq quantization and replicated layers do not support merging. 
+ """ + if getattr(self.model, "quantization_method", None) == "gptq": +- raise ValueError("Cannot merge LORA layers when the model is gptq quantized") ++ peft_config = self.get_peft_config_as_dict() ++ # Check if the 'quantization_aware' flag is set to False in the PEFT configuration ++ # Raise an error if the model is GPTQ quantized and 'quantization_aware' is not enabled ++ if not peft_config.get("quantization_aware", False): ++ raise ValueError("Cannot merge LORA layers when the model is gptq quantized") + if self.peft_config.get("layer_replication"): + raise ValueError("Cannot merge LORA layers when base model layers are replicated") + +diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py +index 5ac1264..acb5d27 100644 +--- a/src/peft/utils/save_and_load.py ++++ b/src/peft/utils/save_and_load.py +@@ -246,6 +246,48 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="defaul + else: + raise NotImplementedError + ++ def module_reshape(state_dict): ++ """Reshape the linear module to match the state dict. ++ ++ Args: ++ state_dict (dict): The state dict containing the parameters. ++ """ ++ for param_name, param in state_dict.items(): ++ tensor_name = param_name ++ splits = tensor_name.split(".") ++ ++ # If the parameter name has multiple parts, navigate through the module hierarchy ++ if len(splits) > 1: ++ module = model ++ parent = None ++ ++ # Traverse the module hierarchy to find the target module ++ for split in splits[:-1]: ++ new_module = getattr(module, split, None) ++ if new_module is None: ++ raise ValueError(f"{module} has no attribute {split}.") ++ parent = module ++ module = new_module ++ ++ tensor_name = splits[-1] ++ old_value = getattr(module, tensor_name) ++ ++ # Check if the shape of the original module differs from the shape of the loaded parameter ++ if old_value.shape != param.shape and isinstance(module, torch.nn.Linear): ++ # Create a new Linear module with the new shape ++ new_module = torch.nn.Linear( ++ param.shape[1], ++ param.shape[0], ++ bias=module.bias is not None, ++ dtype=module.weight.dtype, ++ device=module.weight.device ++ ) ++ # Replace the old module with the new one in the parent module ++ setattr(parent, splits[-2], new_module) ++ ++ # Reshape the modules in the peft model to match the state dict ++ module_reshape(peft_model_state_dict) ++ + load_result = model.load_state_dict(peft_model_state_dict, strict=False) + if config.is_prompt_learning: + model.prompt_encoder[adapter_name].embedding.load_state_dict( diff --git a/comps/finetuning_sqft/patches/transformers-v4.44.2.patch b/comps/finetuning_sqft/patches/transformers-v4.44.2.patch new file mode 100644 index 0000000000..a35e96297a --- /dev/null +++ b/comps/finetuning_sqft/patches/transformers-v4.44.2.patch @@ -0,0 +1,171 @@ +diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py +index 68ba7babf..6b54a3987 100755 +--- a/src/transformers/trainer.py ++++ b/src/transformers/trainer.py +@@ -155,6 +155,7 @@ from .utils import ( + is_in_notebook, + is_ipex_available, + is_lomo_available, ++ is_nncf_available, + is_peft_available, + is_safetensors_available, + is_sagemaker_dp_enabled, +@@ -245,6 +246,11 @@ if is_accelerate_available(): + if is_accelerate_available("0.28.0"): + from accelerate.utils import DataLoaderConfiguration + ++if is_nncf_available(): ++ from nncf.torch.compression_method_api import PTCompressionAlgorithmController ++else: ++ PTCompressionAlgorithmController = None ++ + + def _is_peft_model(model): + if 
is_peft_available(): +@@ -352,6 +358,8 @@ class Trainer: + by this function will be reflected in the predictions received by `compute_metrics`. + + Note that the labels (second parameter) will be `None` if the dataset does not have them. ++ compression_ctrl ([`PTCompressionAlgorithmController`], *optional*): A compression controller to use. Note that ++ this script only supports `ProgressiveShrinkingController` of NNCF (https://github.com/openvinotoolkit/nncf). + + Important attributes: + +@@ -387,6 +395,7 @@ class Trainer: + callbacks: Optional[List[TrainerCallback]] = None, + optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ++ compression_ctrl: PTCompressionAlgorithmController = None + ): + if args is None: + output_dir = "tmp_trainer" +@@ -400,6 +409,7 @@ class Trainer: + " summary statistics should be returned by the function." + ) + self.args = args ++ self.compression_ctrl = compression_ctrl + # Seed must be set before instantiating the model when using model + enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed) + self.hp_name = None +@@ -1040,7 +1050,10 @@ class Trainer: + optimizer = self.optimizer.optimizer + else: + optimizer = self.optimizer +- self.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) ++ # If compression_ctrl (`ProgressiveShrinkingController`) is not used, create a scheduler. ++ # If compression_ctrl is used (not None), it will use its own learning rate scheduler. ++ if self.compression_ctrl is None: ++ self.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) + + def get_decay_parameter_names(self, model) -> List[str]: + """ +@@ -1569,7 +1582,9 @@ class Trainer: + self.state.stateful_callbacks["TrainerControl"] = self.control.state() + self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) + torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) +- torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) ++ # Save the learning rate scheduler state if compression_ctrl is not used. ++ if self.compression_ctrl is None: ++ torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) + + def call_model_init(self, trial=None): + model_init_argcount = number_of_arguments(self.model_init) +@@ -2204,8 +2219,16 @@ class Trainer: + if args.eval_on_start: + self._evaluate(trial, ignore_keys_for_eval, skip_scheduler=True) + ++ # Initialize the learning rate scheduler if compression_ctrl is used. ++ if self.compression_ctrl is not None: ++ train_iters = len(train_dataloader) ++ self.compression_ctrl.set_training_lr_scheduler_args(self.optimizer, train_iters) ++ + total_batched_samples = 0 + for epoch in range(epochs_trained, num_train_epochs): ++ # Perform an epoch step for the compression controller's scheduler if it is used. ++ if self.compression_ctrl is not None: ++ self.compression_ctrl.scheduler.epoch_step() + epoch_iterator = train_dataloader + if hasattr(epoch_iterator, "set_epoch"): + epoch_iterator.set_epoch(epoch) +@@ -2234,6 +2257,10 @@ class Trainer: + + step = -1 + for step, inputs in enumerate(epoch_iterator): ++ # Perform a step for the compression controller's scheduler if it is used. ++ # Include actions such as activating the subnetwork or updating the learning rate. 
++ if self.compression_ctrl is not None: ++ self.compression_ctrl.scheduler.step() + total_batched_samples += 1 + + if self.args.include_num_input_tokens_seen: +@@ -2345,7 +2372,10 @@ class Trainer: + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + if optimizer_was_run: + # Delay optimizer scheduling until metrics are generated +- if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): ++ if ( ++ not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) ++ and self.compression_ctrl is None ++ ): + self.lr_scheduler.step() + + model.zero_grad() +@@ -2791,7 +2821,11 @@ class Trainer: + logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) + if grad_norm is not None: + logs["grad_norm"] = grad_norm.detach().item() if isinstance(grad_norm, torch.Tensor) else grad_norm +- logs["learning_rate"] = self._get_learning_rate() ++ # Retrieve the current learning rate from the compression controller if available, otherwise use the default method ++ if self.compression_ctrl is not None: ++ logs["learning_rate"] = self.compression_ctrl.scheduler.lr_scheduler.get_last_lr()[0] ++ else: ++ logs["learning_rate"] = self._get_learning_rate() + + self._total_loss_scalar += tr_loss_scalar + self._globalstep_last_logged = self.state.global_step +@@ -3015,7 +3049,9 @@ class Trainer: + and not is_torch_xla_available() + ): + with warnings.catch_warnings(record=True) as caught_warnings: +- torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) ++ # Save the learning rate scheduler state if compression_ctrl is not used. ++ if self.compression_ctrl is None: ++ torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) + reissue_pt_warnings(caught_warnings) + + def _load_optimizer_and_scheduler(self, checkpoint): +diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py +index efe473a6c..1040a75f4 100755 +--- a/src/transformers/utils/__init__.py ++++ b/src/transformers/utils/__init__.py +@@ -152,6 +152,7 @@ from .import_utils import ( + is_natten_available, + is_ninja_available, + is_nltk_available, ++ is_nncf_available, + is_onnx_available, + is_openai_available, + is_optimum_available, +diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py +index 3b0abd334..823e8919f 100755 +--- a/src/transformers/utils/import_utils.py ++++ b/src/transformers/utils/import_utils.py +@@ -131,6 +131,7 @@ _levenshtein_available = _is_package_available("Levenshtein") + _librosa_available = _is_package_available("librosa") + _natten_available = _is_package_available("natten") + _nltk_available = _is_package_available("nltk") ++_nncf_available = _is_package_available("nncf") + _onnx_available = _is_package_available("onnx") + _openai_available = _is_package_available("openai") + _optimum_available = _is_package_available("optimum") +@@ -1056,6 +1057,10 @@ def is_nltk_available(): + return _nltk_available + + ++def is_nncf_available(): ++ return _nncf_available ++ ++ + def is_torchaudio_available(): + return _torchaudio_available + diff --git a/comps/finetuning_sqft/requirements.txt b/comps/finetuning_sqft/requirements.txt new file mode 100644 index 0000000000..6eff6b62ac --- /dev/null +++ b/comps/finetuning_sqft/requirements.txt @@ -0,0 +1,17 @@ +aiohttp +datasets +docarray +fastapi +httpx +opentelemetry-api +opentelemetry-exporter-otlp +opentelemetry-sdk +prometheus-fastapi-instrumentator +pydantic==2.8.2 
+pydantic_yaml +python-multipart +pyyaml +ray[all] +requests +shortuuid +uvicorn diff --git a/comps/finetuning_sqft/utils/extract_sub_adapter.py b/comps/finetuning_sqft/utils/extract_sub_adapter.py new file mode 100644 index 0000000000..ae2a3b7faf --- /dev/null +++ b/comps/finetuning_sqft/utils/extract_sub_adapter.py @@ -0,0 +1,101 @@ +import argparse +import os +import re + +import torch +from nncf import NNCFConfig +from peft.utils import CONFIG_NAME, WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME + +PATTERN = re.compile(r"[[](.*?)[]]", re.S) + + +def get_width_for_query_prefix(torch_module_to_width, query_module, length=5): + """ + Get the width for a given query module prefix. + + Args: + torch_module_to_width (dict): Mapping from torch module to width. + query_module (str): The query module name. + length (int, optional): The length of the prefix to match. Default is 5. + + Returns: + int: The width for the query module prefix. + """ + query_module_list = query_module.split(".") + width = next( + ( + value + for torch_module, value in torch_module_to_width.items() + if torch_module.split(".")[:length] == query_module_list[:length] + ), + None, + ) + return width + + +def main(adapter_model_path, nncf_config, sub_adapter_version, custom_config=None): + output_dir = os.path.join(adapter_model_path, sub_adapter_version) + os.makedirs(output_dir, exist_ok=True) + nncf_config = NNCFConfig.from_json(nncf_config) + try: + overwrite_groups = nncf_config["bootstrapNAS"]["training"]["elasticity"]["width"]["overwrite_groups"] + overwrite_groups_widths = nncf_config["bootstrapNAS"]["training"]["elasticity"]["width"][ + "overwrite_groups_widths" + ] + assert len(overwrite_groups) == len(overwrite_groups_widths) + except Exception: + raise ValueError("Cannot get the search space in NNCF config.") + + if sub_adapter_version == "maximal": + subnetwork_config = {idx: space[0] for idx, space in enumerate(overwrite_groups_widths)} + elif sub_adapter_version == "heuristic": + subnetwork_config = {idx: space[(len(space) - 1) // 2] for idx, space in enumerate(overwrite_groups_widths)} + elif sub_adapter_version == "minimal": + subnetwork_config = {idx: space[-1] for idx, space in enumerate(overwrite_groups_widths)} + else: + assert custom_config is not None, "Missing custom subnetwork config." + assert isinstance(custom_config, list), "Custom config must be a list." 
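+        # Illustrative example (hypothetical values): if the NNCF config defines three elastic groups,
+        # custom_config=[16, 12, 8] assigns one width per group, in the same order as `overwrite_groups_widths`.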
+        subnetwork_config = {i: value for i, value in enumerate(custom_config)}
+
+    # Mapping: nncf node -> width
+    nncf_node_to_width = {}
+    for idx, value in subnetwork_config.items():
+        space = overwrite_groups_widths[idx]
+        assert min(space) <= value <= max(space)
+        cur_dict = {node: value for node in overwrite_groups[idx]}
+        nncf_node_to_width.update(cur_dict)
+
+    # Prune adapter model (LoRA low-rank)
+    lora_torch_module_to_width = {
+        ".".join(re.findall(PATTERN, k)): v for k, v in nncf_node_to_width.items() if "lora_A" in k
+    }
+    num_module_name_item = list(lora_torch_module_to_width.keys())[0].split(".").index("lora_A")
+    # Load adapter weights
+    try:
+        super_adapter_weights = torch.load(os.path.join(adapter_model_path, WEIGHTS_NAME))
+    except Exception:
+        from safetensors.torch import load_file
+        super_adapter_weights = load_file(os.path.join(adapter_model_path, SAFETENSORS_WEIGHTS_NAME))
+    sub_adapter_weights = {}
+    for weight_key, weight_tensor in super_adapter_weights.items():
+        width = get_width_for_query_prefix(lora_torch_module_to_width, weight_key, length=num_module_name_item)
+        if width is not None:
+            is_loraA = "lora_A" in weight_key
+            new_weight_tensor = weight_tensor[:width].clone() if is_loraA else weight_tensor[:, :width].clone()
+        else:
+            new_weight_tensor = weight_tensor.clone()
+        sub_adapter_weights[weight_key] = new_weight_tensor
+    os.makedirs(output_dir, exist_ok=True)
+    torch.save(sub_adapter_weights, os.path.join(output_dir, WEIGHTS_NAME))
+    config_path = os.path.join(adapter_model_path, CONFIG_NAME)
+    os.system(f"cp {config_path} {output_dir}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Extract a sub-adapter from an NLS-trained super-adapter")
+    parser.add_argument('--adapter_model_path', type=str, required=True, help="Path to the adapter model")
+    parser.add_argument('--nncf_config', type=str, required=True, help="Path to the NNCF configuration")
+    parser.add_argument('--sub_adapter_version', type=str, required=True, help="Sub-adapter version: maximal, heuristic, minimal, or a custom name (requires --custom_config)")
+    parser.add_argument('--custom_config', type=str, default=None, help="Custom sub-adapter configuration: one width per elastic group (optional)")
+    args = parser.parse_args()
+    main(args.adapter_model_path, args.nncf_config, args.sub_adapter_version, args.custom_config)
diff --git a/comps/finetuning_sqft/utils/merge.py b/comps/finetuning_sqft/utils/merge.py
new file mode 100644
index 0000000000..51b8381235
--- /dev/null
+++ b/comps/finetuning_sqft/utils/merge.py
@@ -0,0 +1,27 @@
+import argparse
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def main(base_model_path, adapter_model_path, output_path):
+    base_model = AutoModelForCausalLM.from_pretrained(base_model_path, trust_remote_code=True)
+    model = PeftModel.from_pretrained(base_model, adapter_model_path)
+    model.eval()
+    for name, param in model.named_parameters():
+        param.requires_grad = False
+    merged_model = model.merge_and_unload()
+    merged_model.train(False)
+    base_model.save_pretrained(output_path, state_dict=merged_model.state_dict())
+
+    tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
+    tokenizer.save_pretrained(output_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Merge base model and adapter model")
+    parser.add_argument('--base_model_path', type=str, required=True, help="Path to the base model")
+    parser.add_argument('--adapter_model_path', type=str, required=True, help="Path to the adapter model")
+    parser.add_argument('--output_path',
type=str, required=True, help="Path to save the merged model") + + args = parser.parse_args() + main(args.base_model_path, args.adapter_model_path, args.output_path) diff --git a/comps/finetuning_sqft/utils/nncf_config_process.py b/comps/finetuning_sqft/utils/nncf_config_process.py new file mode 100644 index 0000000000..521e6fefa7 --- /dev/null +++ b/comps/finetuning_sqft/utils/nncf_config_process.py @@ -0,0 +1,156 @@ +import os +import json +from nncf import NNCFConfig + + +NNCF_CONFIG_TEMPLATE = { + "input_info": [ + { + "sample_size": [1, 256], + "type": "long", + "keyword": "input_ids" + }, + { + "sample_size": [1, 256], + "type": "long", + "keyword": "attention_mask" + } + ], + "bootstrapNAS": { + "training": { + "algorithm": "progressive_shrinking", + "frozen_layers_allowed": True, + "progressivity_of_elasticity": ["width"], + "batchnorm_adaptation": { + "num_bn_adaptation_samples": 0 + }, + "schedule": { + "list_stage_descriptions": [ + {"train_dims": ["width"], "epochs": -1, "depth_indicator": 1, "width_indicator": 8, "init_lr": -1, "epochs_lr": -1, "sample_rate": 1} + ] + }, + "elasticity": { + "available_elasticity_dims": ["width"], + "width": { + "overwrite_groups": [], + "overwrite_groups_widths": [] + } + } + } + } +} + + +def add_lr_epochs(nncf_config, learning_rate=3e-4, num_epochs=3): + """Add learning rate and epochs to the NNCF configuration. + + Args: + nncf_config (dict): The NNCF configuration dictionary. + learning_rate (float): The initial learning rate to set. + num_epochs (int): The number of epochs to set. + + Returns: + dict: The updated NNCF configuration. + """ + stage_description = nncf_config["bootstrapNAS"]["training"]["schedule"]["list_stage_descriptions"][0] + if stage_description["init_lr"] == -1: + stage_description["init_lr"] = learning_rate + if stage_description["epochs"] == -1: + stage_description["epochs"] = num_epochs + stage_description["epochs_lr"] = num_epochs + + return nncf_config + + +def get_model_paths(model, target_module_name): + """ + Find all paths to the target layer in the model. + + Args: + model (torch.nn.Module): The model to search. + target_module_name (str): The name of the target layer. + + Returns: + list: A list of paths to the target layer. + """ + def find_layers(module, target_module_name, path, paths): + for name, sub_module in module.named_children(): + new_path = f"{path}/{sub_module.__class__.__name__}[{name}]" + if target_module_name in name: + # Check if 'lora_A' is in the sub_module's children + for sub_name, _ in sub_module.named_children(): + if "lora_A" in sub_name: + paths.append(f"{new_path}/ModuleDict[lora_A]/NNCFLinear[default]/linear_0") + find_layers(sub_module, target_module_name, new_path, paths) + + base_path = model.__class__.__name__ + paths = [] + find_layers(model, target_module_name, base_path, paths) + return paths + +def load_nncf_config( + config, + model, + target_module_groups=None, + search_space=None, + nncf_config=None +): + """Load and preprocess the NNCF configuration file. + + Returns: + NNCFConfig: The preprocessed NNCF configuration object. + """ + + if nncf_config is not None: + nncf_config = NNCFConfig.from_json(nncf_config) + else: + if search_space is None and target_module_groups: + raise ValueError("Neural LoRA search is enabled, `search_space` and `target_module_groups` must be provided.") + # The NNCF Config will be automatically generated based on `target_module_groups` and `search_space`. 
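+        # Illustrative example (hypothetical values): target_module_groups=[["q_proj", "k_proj", "v_proj"]] with
+        # search_space=["16,12,8"] creates one elastic width group per decoder layer, tying that layer's q/k/v
+        # LoRA-A modules to a shared low-rank choice from [16, 12, 8].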
num_hidden_layers = model.config.num_hidden_layers
+        nncf_config_dict = NNCF_CONFIG_TEMPLATE
+        overwrite_groups = []
+        for group in target_module_groups:
+            group_paths = []
+            for module in group:
+                target_layer_name = module
+                paths = get_model_paths(model, target_layer_name)
+                assert paths, f"No paths found for module {module}"
+                group_paths.append(paths)
+            # Transpose the list of lists to combine paths by their positions
+            transposed_paths = list(zip(*group_paths))
+            overwrite_groups.extend([list(path_group) for path_group in transposed_paths])
+        nncf_config_dict["bootstrapNAS"]["training"]["elasticity"]["width"]["overwrite_groups"] = overwrite_groups
+
+        overwrite_groups_widths = []
+        for space in search_space:
+            space = [int(width) for width in space.split(",")]
+            overwrite_groups_widths.extend([space] * num_hidden_layers)
+        nncf_config_dict["bootstrapNAS"]["training"]["elasticity"]["width"]["overwrite_groups_widths"] = overwrite_groups_widths
+        assert len(overwrite_groups) == len(overwrite_groups_widths)
+        nncf_config_dict = add_lr_epochs(
+            nncf_config_dict,
+            learning_rate=config["Training"]["learning_rate"],
+            num_epochs=config["Training"]["epochs"]
+        )
+        nncf_config = NNCFConfig.from_dict(nncf_config_dict)
+
+    nncf_config["log_dir"] = config["General"]["output_dir"]
+    os.makedirs(nncf_config["log_dir"], exist_ok=True)
+    with open(os.path.join(nncf_config["log_dir"], "nncf_config.json"), "w") as f:
+        json.dump(nncf_config, f, indent=4)
+    return nncf_config
+
+
+if __name__ == '__main__':
+    import transformers
+    from peft import LoraConfig, get_peft_model
+    model = transformers.AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    lora_config = {
+        "task_type": "CAUSAL_LM",
+        "r": 16,
+        "target_modules": ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj"]
+    }
+    peft_config = LoraConfig(**lora_config)
+    model = get_peft_model(model, peft_config)
+    # `load_nncf_config` reads the learning rate, epochs, and output directory from the finetune config,
+    # so this smoke test passes a minimal stand-in config (placeholder values).
+    demo_config = {
+        "Training": {"learning_rate": 3e-4, "epochs": 3},
+        "General": {"output_dir": "nncf_config_demo"},
+    }
+    load_nncf_config(demo_config, model, [["q_proj", "k_proj", "v_proj"], ["up_proj"], ["down_proj"]], ["16,12,8", "16", "16,12"])

From 6cbab5998ef2b7e3c89acdb829a8dcc0f565fb80 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 25 Nov 2024 20:26:49 +0000
Subject: [PATCH 02/17] [pre-commit.ci] auto fixes from pre-commit.com hooks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

for more information, see https://pre-commit.ci

Signed-off-by: J.
Pablo Muñoz --- comps/finetuning/README.md | 4 +- comps/finetuning/finetune_config.py | 10 +- comps/finetuning/finetuning_service.py | 9 +- comps/finetuning/handlers.py | 21 +- .../llm_on_ray/finetune/finetune.py | 20 +- .../utils/create_sqft_nncf_config.py | 40 +- comps/finetuning/utils/extract_sub_adapter.py | 11 +- comps/finetuning/utils/merge_adapter.py | 3 + comps/finetuning_sqft/README.md | 18 +- .../example_nncf_config/nncf_config.json | 978 +++++++----------- comps/finetuning_sqft/finetune_sqft_config.py | 10 +- .../finetuning_sqft_service.py | 9 +- comps/finetuning_sqft/handlers.py | 21 +- .../llm_on_ray/finetune/finetune.py | 15 +- .../patches/peft-v0.10.0.patch | 2 +- .../utils/extract_sub_adapter.py | 17 +- comps/finetuning_sqft/utils/merge.py | 10 +- .../utils/nncf_config_process.py | 76 +- 18 files changed, 510 insertions(+), 764 deletions(-) diff --git a/comps/finetuning/README.md b/comps/finetuning/README.md index d2e26582f4..d6ad323670 100644 --- a/comps/finetuning/README.md +++ b/comps/finetuning/README.md @@ -114,9 +114,9 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \ }' ``` -#### 3.2.2 Instruction Tuning with SQFT's Neural Low-Rank Adapter Search (NLS) +#### 3.2.2 Instruction Tuning with SQFT's Neural Low-Rank Adapter Search (NLS) -In addition to traditional fine-tuning, you can use SQFT's NLS to fine-tune your model. +In addition to traditional fine-tuning, you can use SQFT's NLS to fine-tune your model. More details about SQFT can be found in [this paper](https://aclanthology.org/2024.findings-emnlp.749.pdf). Please follow the additional installation requirements [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea#-start-nls-microservice-with-python). Use the following command to launch a finetuning job with the NLS algorithm: diff --git a/comps/finetuning/finetune_config.py b/comps/finetuning/finetune_config.py index 3f297c80f1..df9a6be9bb 100644 --- a/comps/finetuning/finetune_config.py +++ b/comps/finetuning/finetune_config.py @@ -5,7 +5,7 @@ from typing import List, Optional, Union -from pydantic import BaseModel, Field, validator, root_validator +from pydantic import BaseModel, Field, root_validator, validator from comps.cores.proto.api_protocol import FineTuningJobIDRequest, FineTuningJobsRequest @@ -44,10 +44,10 @@ class SQFTNLSConfig(LoraConfig): @root_validator(pre=True) def set_target_modules(cls, values): - target_module_groups = values.get('target_module_groups') + target_module_groups = values.get("target_module_groups") if target_module_groups is not None: - values['target_modules'] = [item for sublist in target_module_groups for item in sublist] - search_space = values.get('search_space') + values["target_modules"] = [item for sublist in target_module_groups for item in sublist] + search_space = values.get("search_space") if search_space is not None: assert len(search_space) == len(target_module_groups) return values @@ -217,9 +217,11 @@ class FineTuningParams(FineTuningJobsRequest): Dataset: DatasetConfig = DatasetConfig() Training: TrainingConfig = TrainingConfig() + class ExtractSubAdapterParams(FineTuningJobIDRequest): adapter_version: str = "heuristic" custom_config: Optional[List[int]] = None + class MergeAdapterParams(FineTuningJobIDRequest): adapter_version: Optional[str] = None diff --git a/comps/finetuning/finetuning_service.py b/comps/finetuning/finetuning_service.py index 4a925ff837..1d76eab0ae 100644 --- a/comps/finetuning/finetuning_service.py +++ b/comps/finetuning/finetuning_service.py 
@@ -4,7 +4,7 @@ from comps import opea_microservices, register_microservice from comps.cores.proto.api_protocol import FineTuningJobIDRequest, UploadFileRequest -from comps.finetuning.finetune_config import FineTuningParams, ExtractSubAdapterParams, MergeAdapterParams +from comps.finetuning.finetune_config import ExtractSubAdapterParams, FineTuningParams, MergeAdapterParams from comps.finetuning.handlers import ( handle_cancel_finetuning_job, handle_create_finetuning_jobs, @@ -22,6 +22,7 @@ def create_finetuning_jobs(request: FineTuningParams, background_tasks: BackgroundTasks): return handle_create_finetuning_jobs(request, background_tasks) + @register_microservice( name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"] ) @@ -63,10 +64,14 @@ def list_checkpoints(request: FineTuningJobIDRequest): checkpoints = handle_list_finetuning_checkpoints(request) return checkpoints -@register_microservice(name="opea_service@finetuning", endpoint="/v1/finetune/extract_sub_adapter", host="0.0.0.0", port=8015) + +@register_microservice( + name="opea_service@finetuning", endpoint="/v1/finetune/extract_sub_adapter", host="0.0.0.0", port=8015 +) def extract_sub_adapter(request: ExtractSubAdapterParams): return handle_extract_sub_adapter(request) + @register_microservice(name="opea_service@finetuning", endpoint="/v1/finetune/merge_adapter", host="0.0.0.0", port=8015) def merge_adapter(request: MergeAdapterParams): return handle_merge_adapter(request) diff --git a/comps/finetuning/handlers.py b/comps/finetuning/handlers.py index dde6424420..fb4fe9afdc 100644 --- a/comps/finetuning/handlers.py +++ b/comps/finetuning/handlers.py @@ -152,7 +152,7 @@ def handle_extract_sub_adapter(request: ExtractSubAdapterParams): if not os.path.exists(finetuned_model_path): raise HTTPException( status_code=404, - detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!" + detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!", ) if job.status != "succeeded": raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' has not completed!") @@ -160,27 +160,27 @@ def handle_extract_sub_adapter(request: ExtractSubAdapterParams): if finetune_config.General.lora_config is None: raise HTTPException( status_code=404, - detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!" + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!", ) if not finetune_config.General.lora_config.neural_lora_search: raise HTTPException( status_code=404, detail=f"The fine-tuning job '{fine_tuning_job_id}' did not enable NLS algorithm, " - f"there is no need to extract sub-adapters!" + f"there is no need to extract sub-adapters!", ) nncf_config_path = os.path.join(finetune_config.General.output_dir, "nncf_config.json") if not os.path.exists(nncf_config_path): raise HTTPException( - status_code=404, - detail=f"The NNCF config file does not exist in the fine-tuning job '{fine_tuning_job_id}!" + status_code=404, detail=f"The NNCF config file does not exist in the fine-tuning job '{fine_tuning_job_id}!" 
) from comps.finetuning.utils.extract_sub_adapter import main as extract_sub_adapter_main + extract_sub_adapter_main( adapter_model_path=finetuned_model_path, nncf_config=nncf_config_path, adapter_version=request.adapter_version, - custom_config=request.custom_config + custom_config=request.custom_config, ) return fine_tuning_job_id @@ -199,7 +199,7 @@ def handle_merge_adapter(request: MergeAdapterParams): if not os.path.exists(finetuned_model_path): raise HTTPException( status_code=404, - detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!" + detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!", ) if job.status != "succeeded": raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' has not completed!") @@ -207,7 +207,7 @@ def handle_merge_adapter(request: MergeAdapterParams): if finetune_config.General.lora_config is None: raise HTTPException( status_code=404, - detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!" + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!", ) adapter_path = finetuned_model_path @@ -217,14 +217,15 @@ def handle_merge_adapter(request: MergeAdapterParams): if not os.path.exists(adapter_path): raise HTTPException( status_code=404, - detail=f"The fine-tuning job '{fine_tuning_job_id}' does not have a '{adapter_version}' adapter!" + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not have a '{adapter_version}' adapter!", ) from comps.finetuning.utils.merge_adapter import main as merge_adapter_main + merge_adapter_main( base_model_path=finetune_config.General.base_model, adapter_model_path=adapter_path, - output_path=os.path.join(adapter_path, "merged_model") + output_path=os.path.join(adapter_path, "merged_model"), ) return fine_tuning_job_id diff --git a/comps/finetuning/llm_on_ray/finetune/finetune.py b/comps/finetuning/llm_on_ray/finetune/finetune.py index 97a6257b33..5216cc660b 100644 --- a/comps/finetuning/llm_on_ray/finetune/finetune.py +++ b/comps/finetuning/llm_on_ray/finetune/finetune.py @@ -40,11 +40,13 @@ logger = CustomLogger("llm_on_ray/finetune") try: - from comps.finetuning.utils.create_sqft_nncf_config import create_sqft_nncf_config from nncf.experimental.torch.nas.bootstrapNAS.training.model_creator_helpers import ( create_compressed_model_from_algo_names, ) from nncf.torch.model_creation import create_nncf_network + + from comps.finetuning.utils.create_sqft_nncf_config import create_sqft_nncf_config + is_nncf_available = True except ImportError: is_nncf_available = False @@ -368,15 +370,10 @@ def load_model(config: Dict): if not is_nncf_available: raise NotImplementedError("NNCF is not installed. 
Please install it for enabling NLS algorithm.") nncf_config = create_sqft_nncf_config( - config=config, - model=model, - target_module_groups=target_module_groups, - search_space=search_space + config=config, model=model, target_module_groups=target_module_groups, search_space=search_space ) model = create_nncf_network(model, nncf_config) - nls_controller, model = create_compressed_model_from_algo_names( - model, nncf_config, algo_names=["nls"] - ) + nls_controller, model = create_compressed_model_from_algo_names(model, nncf_config, algo_names=["nls"]) elif task == "rerank": model = CrossEncoder.from_pretrained( config["Dataset"].get("train_group_size", 8), @@ -414,6 +411,7 @@ def load_model(config: Dict): return model, ref_model, nls_controller + def get_trainer(config: Dict, model, ref_model, tokenizer, tokenized_dataset, data_collator, nls_controller=None): device = config["Training"]["device"] task = config["General"].get("task", "instruction_tuning") @@ -443,7 +441,9 @@ def get_trainer(config: Dict, model, ref_model, tokenizer, tokenized_dataset, da "model": model, "args": training_args, "train_dataset": tokenized_dataset["train"], - "eval_dataset": tokenized_dataset["validation"] if tokenized_dataset.get("validation") is not None else None, + "eval_dataset": ( + tokenized_dataset["validation"] if tokenized_dataset.get("validation") is not None else None + ), "tokenizer": tokenizer, "data_collator": data_collator, } @@ -453,7 +453,7 @@ def get_trainer(config: Dict, model, ref_model, tokenizer, tokenized_dataset, da return training_args, trainer elif device in ["hpu"]: if nls_controller is not None: - raise NotImplementedError(f"NLS algorithm is not supported on HPU now.") + raise NotImplementedError("NLS algorithm is not supported on HPU now.") from optimum.habana import GaudiConfig from optimum.habana.transformers import GaudiTrainer, GaudiTrainingArguments diff --git a/comps/finetuning/utils/create_sqft_nncf_config.py b/comps/finetuning/utils/create_sqft_nncf_config.py index eb76fcc310..731791da41 100644 --- a/comps/finetuning/utils/create_sqft_nncf_config.py +++ b/comps/finetuning/utils/create_sqft_nncf_config.py @@ -1,9 +1,13 @@ -import os +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import json +import os try: from nncf import NNCFConfig from nncf.experimental.torch import sqft + is_nncf_available = True except ImportError: is_nncf_available = False @@ -11,29 +15,18 @@ NNCF_CONFIG_TEMPLATE = { "input_info": [ - { - "sample_size": [1, 256], - "type": "long", - "keyword": "input_ids" - }, - { - "sample_size": [1, 256], - "type": "long", - "keyword": "attention_mask" - } + {"sample_size": [1, 256], "type": "long", "keyword": "input_ids"}, + {"sample_size": [1, 256], "type": "long", "keyword": "attention_mask"}, ], "SQFT": { "training": { "algorithm": "nls", "elasticity": { "available_elasticity_dims": ["width"], - "width": { - "overwrite_groups": [], - "overwrite_groups_widths": [] - } - } + "width": {"overwrite_groups": [], "overwrite_groups_widths": []}, + }, } - } + }, } @@ -65,8 +58,7 @@ def add_lr_epochs(nncf_config, learning_rate=3e-4, num_train_epochs=3): def get_model_paths(model, target_module_name): - """ - Find all paths to the target layer in the model. + """Find all paths to the target layer in the model. Args: model (torch.nn.Module): The model to search. @@ -75,6 +67,7 @@ def get_model_paths(model, target_module_name): Returns: list: A list of paths to the target layer. 
""" + def find_layers(module, target_module_name, path, paths): for name, sub_module in module.named_children(): new_path = f"{path}/{sub_module.__class__.__name__}[{name}]" @@ -91,12 +84,7 @@ def find_layers(module, target_module_name, path, paths): return paths -def create_sqft_nncf_config( - config, - model, - target_module_groups=None, - search_space=None -): +def create_sqft_nncf_config(config, model, target_module_groups=None, search_space=None): """Load and preprocess the NNCF configuration file. Returns: @@ -131,7 +119,7 @@ def create_sqft_nncf_config( nncf_config_dict = add_lr_epochs( nncf_config_dict, learning_rate=config["Training"]["learning_rate"], - num_train_epochs=config["Training"]["epochs"] + num_train_epochs=config["Training"]["epochs"], ) nncf_config = NNCFConfig.from_dict(nncf_config_dict) diff --git a/comps/finetuning/utils/extract_sub_adapter.py b/comps/finetuning/utils/extract_sub_adapter.py index f7b0bf6ff1..00f477f684 100644 --- a/comps/finetuning/utils/extract_sub_adapter.py +++ b/comps/finetuning/utils/extract_sub_adapter.py @@ -1,12 +1,15 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import os import re import torch - -from peft.utils import CONFIG_NAME, WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME +from peft.utils import CONFIG_NAME, SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME try: from nncf import NNCFConfig + is_nncf_available = True except ImportError: is_nncf_available = False @@ -16,8 +19,7 @@ def get_width_for_query_prefix(torch_module_to_width, query_module, length=5): - """ - Get the width for a given query module prefix. + """Get the width for a given query module prefix. Args: torch_module_to_width (dict): Mapping from torch module to width. @@ -81,6 +83,7 @@ def main(adapter_model_path, nncf_config, adapter_version, custom_config=None): super_adapter_weights = torch.load(os.path.join(adapter_model_path, WEIGHTS_NAME)) except: from safetensors.torch import load_file + super_adapter_weights = load_file(os.path.join(adapter_model_path, SAFETENSORS_WEIGHTS_NAME)) sub_adapter_weights = {} for weight_key, weight_tensor in super_adapter_weights.items(): diff --git a/comps/finetuning/utils/merge_adapter.py b/comps/finetuning/utils/merge_adapter.py index a127061ef6..f1bca2ab51 100644 --- a/comps/finetuning/utils/merge_adapter.py +++ b/comps/finetuning/utils/merge_adapter.py @@ -1,3 +1,6 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + from peft import PeftModel from transformers import AutoModelForCausalLM, AutoTokenizer diff --git a/comps/finetuning_sqft/README.md b/comps/finetuning_sqft/README.md index a5748caf76..e9d2c3e596 100644 --- a/comps/finetuning_sqft/README.md +++ b/comps/finetuning_sqft/README.md @@ -12,6 +12,7 @@ python -m pip install intel-extension-for-pytorch python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ pip install -r requirements.txt ``` + To enable elastic adapter fine-tuning (Neural Low-Rank Adapter Search) or SparsePEFT from [SQFT](https://arxiv.org/abs/2410.03750), please perform this additional installation: ```bash @@ -94,7 +95,6 @@ Download a training file, such as `alpaca_data.json` for instruction tuning and curl http://${your_ip}:8015/v1/files -X POST -H "Content-Type: multipart/form-data" -F "file=@./alpaca_data.json" -F purpose="fine-tune" ``` - ### 3.2 Create fine-tuning job #### 3.2.1 Instruction Tuning @@ -110,7 +110,7 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \ "training_file": 
"alpaca_data.json", "model": "meta-llama/Llama-2-7b-chat-hf" }' - + # create a finetuning job (with SparsePEFT) curl http://${your_ip}:8015/v1/fine_tuning/jobs \ -X POST \ @@ -124,7 +124,7 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \ } } }' - + # create a fine-tuning job (with Neural Low-rank adapter Search) # Max LoRA rank: 16 # LoRA target modules -> Low-rank search space @@ -151,8 +151,8 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \ Below are some explanations for the parameters related to the Neural Low-rank adapter Search algorithm: - `target_module_groups` specifies the target module groups, which means that the adapters within the same group will share the same activated low-rank value. -- `search_space` specifies the search space for each target module (adapter) group. -Here, it is `["16,12,8", "16,12,8", "16,12,8"]`, meaning that the search space for each group is [16, 12, 8]. +- `search_space` specifies the search space for each target module (adapter) group. + Here, it is `["16,12,8", "16,12,8", "16,12,8"]`, meaning that the search space for each group is [16, 12, 8]. Note that the number of groups should be equal to the number of search spaces (one-to-one correspondence). Feel free to try your favorite group design and search spaces. @@ -179,7 +179,7 @@ curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Typ #### 3.4.1 Extract the sub-adapter -After completing the super-adapter fine-tuning (the checkpoints of the fine-tuning job), +After completing the super-adapter fine-tuning (the checkpoints of the fine-tuning job), the following command demonstrates how to extract the heuristic sub-adapter. Additionally, more powerful sub-adapters can be obtained through other advanced search algorithms. @@ -211,10 +211,10 @@ curl http://${your_ip}:8015/v1/finetune/extract_adapter \ }' ``` -In the fine-tuning job with Neural Low-rank adapter Search algorithm, the `nncf_config.json` file (which includes the elastic adapter information) will be saved in the output directory. +In the fine-tuning job with Neural Low-rank adapter Search algorithm, the `nncf_config.json` file (which includes the elastic adapter information) will be saved in the output directory. The `custom_config` must correspond with the `overwrite_groups` (adapter modules) or `overwrite_groups_widths` -(search space for the rank of adapter modules) in `nncf_config.json`. -The above command corresponds to the example in [example_nncf_config/nncf_config.json](./example_nncf_config/nncf_config.json), +(search space for the rank of adapter modules) in `nncf_config.json`. +The above command corresponds to the example in [example_nncf_config/nncf_config.json](./example_nncf_config/nncf_config.json), and it will save the sub-adapter to ` / custom`. 
diff --git a/comps/finetuning_sqft/example_nncf_config/nncf_config.json b/comps/finetuning_sqft/example_nncf_config/nncf_config.json index ead7ffe4c6..7ec9b3a578 100644 --- a/comps/finetuning_sqft/example_nncf_config/nncf_config.json +++ b/comps/finetuning_sqft/example_nncf_config/nncf_config.json @@ -1,630 +1,354 @@ { - "input_info": [ - { - "sample_size": [ - 1, - 256 - ], - "type": "long", - "keyword": "input_ids" - }, - { - "sample_size": [ - 1, - 256 - ], - "type": "long", - "keyword": "attention_mask" - } - ], - "bootstrapNAS": { - "training": { - "algorithm": "progressive_shrinking", - "frozen_layers_allowed": true, - "progressivity_of_elasticity": [ - "width" - ], - "batchnorm_adaptation": { - "num_bn_adaptation_samples": 0 - }, - "schedule": { - "list_stage_descriptions": [ - { - "train_dims": [ - "width" - ], - "epochs": 3, - "depth_indicator": 1, - "width_indicator": 8, - "init_lr": 0.0003, - "epochs_lr": 3, - "sample_rate": 1 - } - ] - }, - "elasticity": { - "available_elasticity_dims": [ - "width" - ], - "width": { - "overwrite_groups": [ - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ] - ], - "overwrite_groups_widths": [ - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ], - [ - 16, - 12, - 8 - ] - ] - } - } + "input_info": [ + { + "sample_size": [1, 256], + "type": "long", + "keyword": "input_ids" + }, + { + "sample_size": [1, 256], + "type": "long", + "keyword": "attention_mask" + } + ], + "bootstrapNAS": { + "training": { + "algorithm": "progressive_shrinking", + "frozen_layers_allowed": true, + "progressivity_of_elasticity": 
["width"], + "batchnorm_adaptation": { + "num_bn_adaptation_samples": 0 + }, + "schedule": { + "list_stage_descriptions": [ + { + "train_dims": ["width"], + "epochs": 3, + "depth_indicator": 1, + "width_indicator": 8, + "init_lr": 0.0003, + "epochs_lr": 3, + "sample_rate": 1 + } + ] + }, + "elasticity": { + "available_elasticity_dims": ["width"], + "width": { + "overwrite_groups": [ + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ], + [ + "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" + ] + ], + "overwrite_groups_widths": [ + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8], + [16, 12, 8] + ] } + } } -} \ No newline at end of file + } +} diff --git a/comps/finetuning_sqft/finetune_sqft_config.py b/comps/finetuning_sqft/finetune_sqft_config.py index a34a9e7c3b..e5b35cef91 100644 --- a/comps/finetuning_sqft/finetune_sqft_config.py +++ b/comps/finetuning_sqft/finetune_sqft_config.py @@ -5,7 +5,7 @@ from typing import List, Optional, Union -from pydantic import BaseModel, Field, validator, root_validator +from pydantic import BaseModel, Field, root_validator, validator from comps.cores.proto.api_protocol import FineTuningJobIDRequest, FineTuningJobsRequest @@ -46,10 +46,10 @@ class SQFTLoRAConfig(LoraConfig): @root_validator(pre=True) def set_target_modules(cls, values): - target_module_groups = values.get('target_module_groups') + target_module_groups = values.get("target_module_groups") if target_module_groups is not None: - values['target_modules'] = [item for sublist in target_module_groups for item in sublist] - search_space = values.get('search_space') + values["target_modules"] = [item for sublist in target_module_groups for item in sublist] + search_space = values.get("search_space") if search_space is not None: assert len(search_space) == len(target_module_groups) return values @@ -207,9 +207,11 @@ class FineTuningParams(FineTuningJobsRequest): Dataset: DatasetConfig = DatasetConfig() Training: TrainingConfig = TrainingConfig() + class ExtractAdapterParams(FineTuningJobIDRequest): sub_adapter_version: str = "heuristic" custom_config: Optional[List[int]] = None + class MergeAdapterParams(FineTuningJobIDRequest): adapter_version: Optional[str] = None diff --git a/comps/finetuning_sqft/finetuning_sqft_service.py b/comps/finetuning_sqft/finetuning_sqft_service.py index bc11a6cd23..af9f237399 100644 --- a/comps/finetuning_sqft/finetuning_sqft_service.py +++ b/comps/finetuning_sqft/finetuning_sqft_service.py @@ -4,7 +4,7 @@ from 
comps import opea_microservices, register_microservice from comps.cores.proto.api_protocol import FineTuningJobIDRequest, UploadFileRequest -from comps.finetuning_sqft.finetune_sqft_config import FineTuningParams, ExtractAdapterParams, MergeAdapterParams +from comps.finetuning_sqft.finetune_sqft_config import ExtractAdapterParams, FineTuningParams, MergeAdapterParams from comps.finetuning_sqft.handlers import ( handle_cancel_finetuning_job, handle_create_finetuning_jobs, @@ -22,6 +22,7 @@ def create_finetuning_jobs(request: FineTuningParams, background_tasks: BackgroundTasks): return handle_create_finetuning_jobs(request, background_tasks) + @register_microservice( name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"] ) @@ -63,10 +64,14 @@ def list_checkpoints(request: FineTuningJobIDRequest): checkpoints = handle_list_finetuning_checkpoints(request) return checkpoints -@register_microservice(name="opea_service@finetuning", endpoint="/v1/finetune/extract_adapter", host="0.0.0.0", port=8015) + +@register_microservice( + name="opea_service@finetuning", endpoint="/v1/finetune/extract_adapter", host="0.0.0.0", port=8015 +) def extract_sub_adapter(request: ExtractAdapterParams): return handle_extract_sub_adapter(request) + @register_microservice(name="opea_service@finetuning", endpoint="/v1/finetune/merge_adapter", host="0.0.0.0", port=8015) def merge_adapter(request: MergeAdapterParams): return handle_merge_adapter(request) diff --git a/comps/finetuning_sqft/handlers.py b/comps/finetuning_sqft/handlers.py index 03e5745981..cdb6b224aa 100644 --- a/comps/finetuning_sqft/handlers.py +++ b/comps/finetuning_sqft/handlers.py @@ -152,7 +152,7 @@ def handle_extract_sub_adapter(request: ExtractAdapterParams): if not os.path.exists(finetuned_model_path): raise HTTPException( status_code=404, - detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!" + detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!", ) if job.status != "succeeded": raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' has not completed!") @@ -160,27 +160,27 @@ def handle_extract_sub_adapter(request: ExtractAdapterParams): if finetune_config.General.lora_config is None: raise HTTPException( status_code=404, - detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!" + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!", ) if not finetune_config.General.lora_config.neural_lora_search: raise HTTPException( status_code=404, detail=f"The fine-tuning job '{fine_tuning_job_id}' did not enable NLS algorithm, " - f"there is no need to extract sub-adapters!" + f"there is no need to extract sub-adapters!", ) nncf_config_path = os.path.join(finetune_config.General.output_dir, "nncf_config.json") if not os.path.exists(nncf_config_path): raise HTTPException( - status_code=404, - detail=f"The NNCF config file does not exist in the fine-tuning job '{fine_tuning_job_id}!" + status_code=404, detail=f"The NNCF config file does not exist in the fine-tuning job '{fine_tuning_job_id}!" 
) from comps.finetuning_sqft.utils.extract_sub_adapter import main as extract_sub_adapter_main + extract_sub_adapter_main( adapter_model_path=finetuned_model_path, nncf_config=nncf_config_path, sub_adapter_version=request.sub_adapter_version, - custom_config=request.custom_config + custom_config=request.custom_config, ) return fine_tuning_job_id @@ -199,7 +199,7 @@ def handle_merge_adapter(request: MergeAdapterParams): if not os.path.exists(finetuned_model_path): raise HTTPException( status_code=404, - detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!" + detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!", ) if job.status != "succeeded": raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' has not completed!") @@ -207,7 +207,7 @@ def handle_merge_adapter(request: MergeAdapterParams): if finetune_config.General.lora_config is None: raise HTTPException( status_code=404, - detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!" + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!", ) adapter_path = finetuned_model_path @@ -217,14 +217,15 @@ def handle_merge_adapter(request: MergeAdapterParams): if not os.path.exists(adapter_path): raise HTTPException( status_code=404, - detail=f"The fine-tuning job '{fine_tuning_job_id}' does not have a '{adapter_version}' adapter!" + detail=f"The fine-tuning job '{fine_tuning_job_id}' does not have a '{adapter_version}' adapter!", ) from comps.finetuning_sqft.utils.merge import main as merge_adapter_main + merge_adapter_main( base_model_path=finetune_config.General.base_model, adapter_model_path=adapter_path, - output_path=os.path.join(adapter_path, "merged_model") + output_path=os.path.join(adapter_path, "merged_model"), ) return fine_tuning_job_id diff --git a/comps/finetuning_sqft/llm_on_ray/finetune/finetune.py b/comps/finetuning_sqft/llm_on_ray/finetune/finetune.py index 82f2e65c1d..8433cbacb8 100644 --- a/comps/finetuning_sqft/llm_on_ray/finetune/finetune.py +++ b/comps/finetuning_sqft/llm_on_ray/finetune/finetune.py @@ -38,11 +38,13 @@ logger = CustomLogger("llm_on_ray/finetune") try: - from comps.finetuning_sqft.utils.nncf_config_process import load_nncf_config from nncf.experimental.torch.nas.bootstrapNAS.training.model_creator_helpers import ( create_compressed_model_from_algo_names, ) from nncf.torch.model_creation import create_nncf_network + + from comps.finetuning_sqft.utils.nncf_config_process import load_nncf_config + is_nncf_available = True except ImportError: is_nncf_available = False @@ -358,7 +360,7 @@ def load_model(config: Dict): model=model, target_module_groups=target_module_groups, search_space=search_space, - nncf_config=nncf_config + nncf_config=nncf_config, ) model = create_nncf_network(model, nncf_config) compression_ctrl, model = create_compressed_model_from_algo_names( @@ -401,6 +403,7 @@ def load_model(config: Dict): return model, compression_ctrl + def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator, compression_ctrl=None): device = config["Training"]["device"] if device in ["cpu", "gpu", "cuda"]: @@ -409,7 +412,9 @@ def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator "model": model, "args": training_args, "train_dataset": tokenized_dataset["train"], - "eval_dataset": tokenized_dataset["validation"] if tokenized_dataset.get("validation") is not None else None, + 
"eval_dataset": ( + tokenized_dataset["validation"] if tokenized_dataset.get("validation") is not None else None + ), "tokenizer": tokenizer, "data_collator": data_collator, } @@ -471,7 +476,9 @@ def train_func(config: Dict[str, Any]): model, compression_ctrl = load_model(config) - training_args, trainer = get_trainer(config, model, tokenizer, tokenized_dataset, data_collator, compression_ctrl=compression_ctrl) + training_args, trainer = get_trainer( + config, model, tokenizer, tokenized_dataset, data_collator, compression_ctrl=compression_ctrl + ) logger.info("train start") trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) diff --git a/comps/finetuning_sqft/patches/peft-v0.10.0.patch b/comps/finetuning_sqft/patches/peft-v0.10.0.patch index caefc3e735..9606bd24ef 100644 --- a/comps/finetuning_sqft/patches/peft-v0.10.0.patch +++ b/comps/finetuning_sqft/patches/peft-v0.10.0.patch @@ -169,7 +169,7 @@ diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py index 5ac1264..acb5d27 100644 --- a/src/peft/utils/save_and_load.py +++ b/src/peft/utils/save_and_load.py -@@ -246,6 +246,48 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="defaul +@@ -246,6 +246,48 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="default else: raise NotImplementedError diff --git a/comps/finetuning_sqft/utils/extract_sub_adapter.py b/comps/finetuning_sqft/utils/extract_sub_adapter.py index ae2a3b7faf..82e4471719 100644 --- a/comps/finetuning_sqft/utils/extract_sub_adapter.py +++ b/comps/finetuning_sqft/utils/extract_sub_adapter.py @@ -1,17 +1,19 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse import os import re import torch from nncf import NNCFConfig -from peft.utils import CONFIG_NAME, WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME +from peft.utils import CONFIG_NAME, SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME PATTERN = re.compile(r"[[](.*?)[]]", re.S) def get_width_for_query_prefix(torch_module_to_width, query_module, length=5): - """ - Get the width for a given query module prefix. + """Get the width for a given query module prefix. Args: torch_module_to_width (dict): Mapping from torch module to width. 
@@ -75,6 +77,7 @@ def main(adapter_model_path, nncf_config, sub_adapter_version, custom_config=Non super_adapter_weights = torch.load(os.path.join(adapter_model_path, WEIGHTS_NAME)) except: from safetensors.torch import load_file + super_adapter_weights = load_file(os.path.join(adapter_model_path, SAFETENSORS_WEIGHTS_NAME)) sub_adapter_weights = {} for weight_key, weight_tensor in super_adapter_weights.items(): @@ -93,9 +96,9 @@ def main(adapter_model_path, nncf_config, sub_adapter_version, custom_config=Non if __name__ == "__main__": parser = argparse.ArgumentParser(description="Merge base model and adapter model with additional configurations") - parser.add_argument('--adapter_model_path', type=str, required=True, help="Path to the adapter model") - parser.add_argument('--nncf_config', type=str, required=True, help="Path to the NNCF configuration") - parser.add_argument('--sub_adapter_version', type=str, required=True, help="Sub adapter version") - parser.add_argument('--custom_config', type=str, default=None, help="Path to custom configuration (optional)") + parser.add_argument("--adapter_model_path", type=str, required=True, help="Path to the adapter model") + parser.add_argument("--nncf_config", type=str, required=True, help="Path to the NNCF configuration") + parser.add_argument("--sub_adapter_version", type=str, required=True, help="Sub adapter version") + parser.add_argument("--custom_config", type=str, default=None, help="Path to custom configuration (optional)") args = parser.parse_args() main(args.adapter_model_path, args.nncf_config, args.sub_adapter_version, args.custom_config) diff --git a/comps/finetuning_sqft/utils/merge.py b/comps/finetuning_sqft/utils/merge.py index 51b8381235..266ee0eac4 100644 --- a/comps/finetuning_sqft/utils/merge.py +++ b/comps/finetuning_sqft/utils/merge.py @@ -1,4 +1,8 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import argparse + from peft import PeftModel from transformers import AutoModelForCausalLM, AutoTokenizer @@ -19,9 +23,9 @@ def main(base_model_path, adapter_model_path, output_path): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Merge base model and adapter model") - parser.add_argument('--base_model_path', type=str, required=True, help="Path to the base model") - parser.add_argument('--adapter_model_path', type=str, required=True, help="Path to the adapter model") - parser.add_argument('--output_path', type=str, required=True, help="Path to save the merged model") + parser.add_argument("--base_model_path", type=str, required=True, help="Path to the base model") + parser.add_argument("--adapter_model_path", type=str, required=True, help="Path to the adapter model") + parser.add_argument("--output_path", type=str, required=True, help="Path to save the merged model") args = parser.parse_args() main(args.base_model_path, args.adapter_model_path, args.output_path) diff --git a/comps/finetuning_sqft/utils/nncf_config_process.py b/comps/finetuning_sqft/utils/nncf_config_process.py index 521e6fefa7..5f6abb7c8f 100644 --- a/comps/finetuning_sqft/utils/nncf_config_process.py +++ b/comps/finetuning_sqft/utils/nncf_config_process.py @@ -1,43 +1,41 @@ -import os +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import json -from nncf import NNCFConfig +import os +from nncf import NNCFConfig NNCF_CONFIG_TEMPLATE = { "input_info": [ - { - "sample_size": [1, 256], - "type": "long", - "keyword": "input_ids" - }, - { - "sample_size": [1, 256], - "type": 
"long", - "keyword": "attention_mask" - } + {"sample_size": [1, 256], "type": "long", "keyword": "input_ids"}, + {"sample_size": [1, 256], "type": "long", "keyword": "attention_mask"}, ], "bootstrapNAS": { "training": { "algorithm": "progressive_shrinking", "frozen_layers_allowed": True, "progressivity_of_elasticity": ["width"], - "batchnorm_adaptation": { - "num_bn_adaptation_samples": 0 - }, + "batchnorm_adaptation": {"num_bn_adaptation_samples": 0}, "schedule": { "list_stage_descriptions": [ - {"train_dims": ["width"], "epochs": -1, "depth_indicator": 1, "width_indicator": 8, "init_lr": -1, "epochs_lr": -1, "sample_rate": 1} + { + "train_dims": ["width"], + "epochs": -1, + "depth_indicator": 1, + "width_indicator": 8, + "init_lr": -1, + "epochs_lr": -1, + "sample_rate": 1, + } ] }, "elasticity": { "available_elasticity_dims": ["width"], - "width": { - "overwrite_groups": [], - "overwrite_groups_widths": [] - } - } + "width": {"overwrite_groups": [], "overwrite_groups_widths": []}, + }, } - } + }, } @@ -63,8 +61,7 @@ def add_lr_epochs(nncf_config, learning_rate=3e-4, num_epochs=3): def get_model_paths(model, target_module_name): - """ - Find all paths to the target layer in the model. + """Find all paths to the target layer in the model. Args: model (torch.nn.Module): The model to search. @@ -73,6 +70,7 @@ def get_model_paths(model, target_module_name): Returns: list: A list of paths to the target layer. """ + def find_layers(module, target_module_name, path, paths): for name, sub_module in module.named_children(): new_path = f"{path}/{sub_module.__class__.__name__}[{name}]" @@ -88,13 +86,8 @@ def find_layers(module, target_module_name, path, paths): find_layers(model, target_module_name, base_path, paths) return paths -def load_nncf_config( - config, - model, - target_module_groups=None, - search_space=None, - nncf_config=None -): + +def load_nncf_config(config, model, target_module_groups=None, search_space=None, nncf_config=None): """Load and preprocess the NNCF configuration file. Returns: @@ -105,7 +98,9 @@ def load_nncf_config( nncf_config = NNCFConfig.from_json(nncf_config) else: if search_space is None and target_module_groups: - raise ValueError("Neural LoRA search is enabled, `search_space` and `target_module_groups` must be provided.") + raise ValueError( + "Neural LoRA search is enabled, `search_space` and `target_module_groups` must be provided." + ) # The NNCF Config will be automatically generated based on `target_module_groups` and `search_space`. 
num_hidden_layers = model.config.num_hidden_layers nncf_config_dict = NNCF_CONFIG_TEMPLATE @@ -126,12 +121,12 @@ def load_nncf_config( for space in search_space: space = [int(width) for width in space.split(",")] overwrite_groups_widths.extend([space] * num_hidden_layers) - nncf_config_dict["bootstrapNAS"]["training"]["elasticity"]["width"]["overwrite_groups_widths"] = overwrite_groups_widths + nncf_config_dict["bootstrapNAS"]["training"]["elasticity"]["width"][ + "overwrite_groups_widths" + ] = overwrite_groups_widths assert len(overwrite_groups) == len(overwrite_groups_widths) nncf_config_dict = add_lr_epochs( - nncf_config_dict, - learning_rate=config["Training"]["learning_rate"], - num_epochs=config["Training"]["epochs"] + nncf_config_dict, learning_rate=config["Training"]["learning_rate"], num_epochs=config["Training"]["epochs"] ) nncf_config = NNCFConfig.from_dict(nncf_config_dict) @@ -142,15 +137,18 @@ def load_nncf_config( return nncf_config -if __name__ == '__main__': +if __name__ == "__main__": import transformers from peft import LoraConfig, get_peft_model + model = transformers.AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") lora_config = { "task_type": "CAUSAL_LM", "r": 16, - "target_modules": ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj"] + "target_modules": ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj"], } peft_config = LoraConfig(**lora_config) model = get_peft_model(model, peft_config) - load_nncf_config(None, model, [["q_proj", "k_proj", "v_proj"], ["up_proj"], ["down_proj"]], ["16,12,8", "16", "16,12"]) + load_nncf_config( + None, model, [["q_proj", "k_proj", "v_proj"], ["up_proj"], ["down_proj"]], ["16,12,8", "16", "16,12"] + ) From 156c2255cf7295ec51467cb889e2da43e908cace Mon Sep 17 00:00:00 2001 From: Yuan Jinjie Date: Tue, 26 Nov 2024 08:57:52 +0800 Subject: [PATCH 03/17] Delete old finetuning_sqft directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: J. Pablo Muñoz Signed-off-by: J. 
Pablo Muñoz --- comps/finetuning_sqft/Dockerfile | 50 -- comps/finetuning_sqft/README.md | 240 ------- .../example_nncf_config/nncf_config.json | 354 ---------- comps/finetuning_sqft/finetune_runner.py | 38 -- comps/finetuning_sqft/finetune_sqft_config.py | 217 ------- .../finetuning_sqft_service.py | 81 --- comps/finetuning_sqft/handlers.py | 339 ---------- comps/finetuning_sqft/launch.sh | 12 - .../llm_on_ray/common/__init__.py | 6 - .../llm_on_ray/common/common.py | 29 - .../llm_on_ray/common/torch_config.py | 72 --- .../llm_on_ray/finetune/__init__.py | 4 - .../llm_on_ray/finetune/data_process.py | 352 ---------- .../llm_on_ray/finetune/finetune.py | 609 ------------------ .../llm_on_ray/finetune/modeling.py | 211 ------ .../patches/nncf-v2.12.0.patch | 72 --- .../patches/peft-v0.10.0.patch | 220 ------- .../patches/transformers-v4.44.2.patch | 171 ----- comps/finetuning_sqft/requirements.txt | 17 - .../utils/extract_sub_adapter.py | 104 --- comps/finetuning_sqft/utils/merge.py | 31 - .../utils/nncf_config_process.py | 154 ----- 22 files changed, 3383 deletions(-) delete mode 100644 comps/finetuning_sqft/Dockerfile delete mode 100644 comps/finetuning_sqft/README.md delete mode 100644 comps/finetuning_sqft/example_nncf_config/nncf_config.json delete mode 100644 comps/finetuning_sqft/finetune_runner.py delete mode 100644 comps/finetuning_sqft/finetune_sqft_config.py delete mode 100644 comps/finetuning_sqft/finetuning_sqft_service.py delete mode 100644 comps/finetuning_sqft/handlers.py delete mode 100644 comps/finetuning_sqft/launch.sh delete mode 100644 comps/finetuning_sqft/llm_on_ray/common/__init__.py delete mode 100644 comps/finetuning_sqft/llm_on_ray/common/common.py delete mode 100644 comps/finetuning_sqft/llm_on_ray/common/torch_config.py delete mode 100644 comps/finetuning_sqft/llm_on_ray/finetune/__init__.py delete mode 100644 comps/finetuning_sqft/llm_on_ray/finetune/data_process.py delete mode 100644 comps/finetuning_sqft/llm_on_ray/finetune/finetune.py delete mode 100644 comps/finetuning_sqft/llm_on_ray/finetune/modeling.py delete mode 100644 comps/finetuning_sqft/patches/nncf-v2.12.0.patch delete mode 100644 comps/finetuning_sqft/patches/peft-v0.10.0.patch delete mode 100644 comps/finetuning_sqft/patches/transformers-v4.44.2.patch delete mode 100644 comps/finetuning_sqft/requirements.txt delete mode 100644 comps/finetuning_sqft/utils/extract_sub_adapter.py delete mode 100644 comps/finetuning_sqft/utils/merge.py delete mode 100644 comps/finetuning_sqft/utils/nncf_config_process.py diff --git a/comps/finetuning_sqft/Dockerfile b/comps/finetuning_sqft/Dockerfile deleted file mode 100644 index 4715470aec..0000000000 --- a/comps/finetuning_sqft/Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Use the same python version with ray -FROM python:3.10.14 - -ARG HF_TOKEN - -ENV HF_TOKEN=$HF_TOKEN - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -COPY comps /home/user/comps - -RUN chown -R user /home/user/comps/finetuning_sqft - -USER user - -ENV PATH=$PATH:/home/user/.local/bin - -RUN python -m pip install --no-cache-dir --upgrade pip && \ - python -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \ - python -m pip install --no-cache-dir intel-extension-for-pytorch && \ - python -m pip install --no-cache-dir oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ 
&& \ - python -m pip install --no-cache-dir -r /home/user/comps/finetuning/requirements.txt - -# Set up third-party dependencies (SQFT) -ENV PATH_TO_FINETUNE=/home/user/comps/finetuning_sqft -RUN mkdir -p $PATH_TO_FINETUNE/third_party && cd $PATH_TO_FINETUNE/third_party && \ - git clone https://github.com/huggingface/peft.git && \ - cd peft && git checkout v0.10.0 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/peft-v0.10.0.patch && pip install -e . && cd .. && \ - git clone https://github.com/huggingface/transformers.git && \ - cd transformers && git checkout v4.44.2 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/transformers-v4.44.2.patch && pip install -e . && cd .. && \ - git clone https://github.com/openvinotoolkit/nncf.git && \ - cd nncf && git checkout v2.12.0 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/nncf-v2.12.0.patch && pip install -e . && cd .. - -ENV PYTHONPATH=$PYTHONPATH:/home/user - -WORKDIR /home/user/comps/finetuning_sqft - -RUN echo PKGPATH=$(python3 -c "import pkg_resources; print(pkg_resources.get_distribution('oneccl-bind-pt').location)") >> run.sh && \ - echo 'export LD_LIBRARY_PATH=$PKGPATH/oneccl_bindings_for_pytorch/opt/mpi/lib/:$LD_LIBRARY_PATH' >> run.sh && \ - echo 'source $PKGPATH/oneccl_bindings_for_pytorch/env/setvars.sh' >> run.sh && \ - echo ray start --head --dashboard-host=0.0.0.0 >> run.sh && \ - echo export RAY_ADDRESS=http://localhost:8265 >> run.sh && \ - echo python finetuning_sqft_service_service.py >> run.sh - -CMD bash run.sh diff --git a/comps/finetuning_sqft/README.md b/comps/finetuning_sqft/README.md deleted file mode 100644 index e9d2c3e596..0000000000 --- a/comps/finetuning_sqft/README.md +++ /dev/null @@ -1,240 +0,0 @@ -# SQFT Fine-tuning Microservice - -Fine-tuning microservice with SQFT involves adapting a model to a specific task or dataset to improve its performance on that task, we currently support instruction tuning for LLMs. - -## 🚀1. Start Microservice with Python (Option 1) - -### 1.1 Install Requirements - -```bash -python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu -python -m pip install intel-extension-for-pytorch -python -m pip install oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ -pip install -r requirements.txt -``` - -To enable elastic adapter fine-tuning (Neural Low-Rank Adapter Search) or SparsePEFT from [SQFT](https://arxiv.org/abs/2410.03750), please perform this additional installation: - -```bash -PATH_TO_FINETUNE=$PWD -mkdir third_party && cd third_party - -# transformers (for Neural Lora Search) -git clone https://github.com/huggingface/transformers.git -cd transformers && git checkout v4.44.2 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/transformers-v4.44.2.patch && pip install -e . && cd .. - -# peft (for Neural Low-Rank Adapter Search and SparsePEFT) -git clone https://github.com/huggingface/peft.git -cd peft && git checkout v0.10.0 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/peft-v0.10.0.patch && pip install -e . && cd .. - -# nncf (for Neural Lora Search) -git clone https://github.com/openvinotoolkit/nncf.git -cd nncf && git checkout v2.12.0 && git apply --ignore-space-change --ignore-whitespace ${PATH_TO_FINETUNE}/patches/nncf-v2.12.0.patch && pip install -e . && cd .. 
-``` - -### 1.2 Start Fine-tuning Service with Python Script - -#### 1.2.1 Start Ray Cluster - -OneCCL and Intel MPI libraries should be dynamically linked in every node before Ray starts: - -```bash -source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl; print(torch_ccl.cwd)")/env/setvars.sh -``` - -Start Ray locally using the following command. - -```bash -ray start --head -``` - -For a multi-node cluster, start additional Ray worker nodes with below command. - -```bash -ray start --address='${head_node_ip}:6379' -``` - -#### 1.2.2 Start Finetuning Service - -```bash -export HF_TOKEN= -export PYTHONPATH= -python finetuning_sqft_service.py -``` - -## 🚀2. Start Microservice with Docker (Option 2) - -### 2.1 Setup on CPU - -#### 2.1.1 Build Docker Image - -Build docker image with below command: - -```bash -export HF_TOKEN=${your_huggingface_token} -cd ../../ -docker build -t opea/finetuning:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg HF_TOKEN=$HF_TOKEN -f comps/finetuning_sqft/Dockerfile . -``` - -#### 2.1.2 Run Docker with CLI - -Start docker container with below command: - -```bash -docker run -d --name="finetuning-server" -p 8015:8015 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/finetuning:latest -``` - -## 🚀3. Consume Fine-tuning Service - -### 3.1 Upload a training file - -Download a training file, such as `alpaca_data.json` for instruction tuning and upload it to the server with below command, this file can be downloaded in [here](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json): - -```bash -# upload a training file -curl http://${your_ip}:8015/v1/files -X POST -H "Content-Type: multipart/form-data" -F "file=@./alpaca_data.json" -F purpose="fine-tune" -``` - -### 3.2 Create fine-tuning job - -#### 3.2.1 Instruction Tuning - -After a training file like `alpaca_data.json` is uploaded, use the following command to launch a fine-tuning job using `meta-llama/Llama-2-7b-chat-hf` as base model: - -```bash -# create a finetuning job -curl http://${your_ip}:8015/v1/fine_tuning/jobs \ - -X POST \ - -H "Content-Type: application/json" \ - -d '{ - "training_file": "alpaca_data.json", - "model": "meta-llama/Llama-2-7b-chat-hf" - }' - -# create a finetuning job (with SparsePEFT) -curl http://${your_ip}:8015/v1/fine_tuning/jobs \ - -X POST \ - -H "Content-Type: application/json" \ - -d '{ - "training_file": "alpaca_data.json", - "model": , - "General": { - "lora_config": { - "sparse_adapter": true - } - } - }' - -# create a fine-tuning job (with Neural Low-rank adapter Search) -# Max LoRA rank: 16 -# LoRA target modules -> Low-rank search space -# ["q_proj", "k_proj", "v_proj"] -> [16,12,8] -# ["up_proj"] -> [16,12,8] -# ["down_proj"] -> [16,12,8] -curl http://${your_ip}:8015/v1/fine_tuning/jobs \ - -X POST \ - -H "Content-Type: application/json" \ - -d '{ - "training_file": "alpaca_data.json", - "model": "meta-llama/Llama-2-7b-chat-hf", - "General": { - "lora_config": { - "r": 16, - "neural_lora_search": true, - "target_module_groups": [["q_proj", "k_proj", "v_proj"], ["up_proj"], ["down_proj"]], - "search_space": ["16,12,8", "16,12,8", "16,12,8"] - } - } - }' -``` - -Below are some explanations for the parameters related to the Neural Low-rank adapter Search algorithm: - -- `target_module_groups` specifies the target module groups, which means that the adapters within the same group will share the same activated low-rank value. 
-- `search_space` specifies the search space for each target module (adapter) group. - Here, it is `["16,12,8", "16,12,8", "16,12,8"]`, meaning that the search space for each group is [16, 12, 8]. - -Note that the number of groups should be equal to the number of search spaces (one-to-one correspondence). -Feel free to try your favorite group design and search spaces. - -### 3.3 Manage fine-tuning job - -Below commands show how to list fine-tuning jobs, retrieve a fine-tuning job, cancel a fine-tuning job and list checkpoints of a fine-tuning job. - -```bash -# list fine-tuning jobs -curl http://${your_ip}:8015/v1/fine_tuning/jobs -X GET - -# retrieve one fine-tuning job -curl http://localhost:8015/v1/fine_tuning/jobs/retrieve -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}' - -# cancel one fine-tuning job -curl http://localhost:8015/v1/fine_tuning/jobs/cancel -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}' - -# list checkpoints of a fine-tuning job -curl http://${your_ip}:8015/v1/finetune/list_checkpoints -X POST -H "Content-Type: application/json" -d '{"fine_tuning_job_id": ${fine_tuning_job_id}}' -``` - -### 3.4 Leverage fine-tuned model - -#### 3.4.1 Extract the sub-adapter - -After completing the super-adapter fine-tuning (the checkpoints of the fine-tuning job), -the following command demonstrates how to extract the heuristic sub-adapter. -Additionally, more powerful sub-adapters can be obtained through other advanced search algorithms. - -```bash -curl http://${your_ip}:8015/v1/finetune/extract_adapter \ - -X POST \ - -H "Content-Type: application/json" \ - -d '{ - "fine_tuning_job_id": ${fine_tuning_job_id}, - "sub_adapter_version": "heuristic" - }' -``` - -`sub_adapter_version` can be heuristic, minimal, or a custom name. -When `sub_adapter_version` is set to a custom name, we need to provide a specific configuration in `custom_config`. -The extracted adapter will be saved in ` / `. - -
-An example of a custom configuration
-
-```bash
-curl http://${your_ip}:8015/v1/finetune/extract_adapter \
-  -X POST \
-  -H "Content-Type: application/json" \
-  -d '{
-    "fine_tuning_job_id": ${fine_tuning_job_id},
-    "sub_adapter_version": "custom",
-    "custom_config": [8, 8, 16, 8, 8, 12, 8, 12, 12, 12, 8, 16, 12, 16, 16, 12, 12, 8, 8, 16, 8, 8, 12, 8, 16, 12, 8, 16, 8, 16, 12, 8, 8, 16, 16, 16, 16, 16, 8, 12, 12, 16, 12, 16, 12, 16, 16, 12, 8, 12, 12, 8, 8, 12, 8, 12, 12, 8, 16, 8, 8, 8, 8, 12, 16, 16]
-  }'
-```
-
-In a fine-tuning job with the Neural Low-rank adapter Search algorithm, the `nncf_config.json` file (which includes the elastic adapter information) will be saved in the output directory.
-The `custom_config` must correspond to the `overwrite_groups` (adapter modules) or `overwrite_groups_widths`
-(search space for the rank of adapter modules) in `nncf_config.json`.
-The above command corresponds to the example in [example_nncf_config/nncf_config.json](./example_nncf_config/nncf_config.json),
-and it will save the sub-adapter to ` / custom`.
-
-
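Before submitting the request above, it can help to sanity-check a hand-written `custom_config` against the generated `nncf_config.json`. The snippet below is a minimal, illustrative sketch and not part of the microservice API: `check_custom_config` is a hypothetical helper, and the only structure it assumes is the `bootstrapNAS.training.elasticity.width.overwrite_groups` / `overwrite_groups_widths` layout shown in the example config in this patch. It verifies that the list supplies one rank per adapter group and that every rank lies inside that group's search space.

```python
import json


def check_custom_config(nncf_config_path, custom_config):
    """Sanity-check a hand-written ``custom_config`` against the generated NNCF config."""
    with open(nncf_config_path) as f:
        width_cfg = json.load(f)["bootstrapNAS"]["training"]["elasticity"]["width"]
    groups = width_cfg["overwrite_groups"]         # adapter-module groups that share one activated rank
    widths = width_cfg["overwrite_groups_widths"]  # allowed ranks per group, e.g. [16, 12, 8]
    if len(custom_config) != len(widths):
        raise ValueError(f"custom_config has {len(custom_config)} entries, expected {len(widths)}")
    for idx, (rank, allowed) in enumerate(zip(custom_config, widths)):
        if rank not in allowed:
            raise ValueError(f"group {idx} ({groups[idx][0]}): rank {rank} is not in the search space {allowed}")


# Usage sketch against the example config shipped with the microservice:
# check_custom_config("example_nncf_config/nncf_config.json",
#                     [8, 8, 16, ...])  # the 66-element list from the request above
```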
- -#### 3.4.2 Merge - -The following command demonstrates how to merge the sub-adapter to the base pretrained model: - -```bash -curl http://${your_ip}:8015/v1/ffinetune/merge_adapter \ - -X POST \ - -H "Content-Type: application/json" \ - -d '{ - "fine_tuning_job_id": ${fine_tuning_job_id}, - "sub_adapter_version": "heuristic" - }' -``` - -The merged model will be saved in ` / / merged_model`. - -## 🚀4. Descriptions for Finetuning parameters - -We utilize [OpenAI finetuning parameters](https://platform.openai.com/docs/api-reference/fine-tuning) and extend it with more customizable parameters, see the definitions at [finetune_sqft_config](./finetune_sqft_config.py). diff --git a/comps/finetuning_sqft/example_nncf_config/nncf_config.json b/comps/finetuning_sqft/example_nncf_config/nncf_config.json deleted file mode 100644 index 7ec9b3a578..0000000000 --- a/comps/finetuning_sqft/example_nncf_config/nncf_config.json +++ /dev/null @@ -1,354 +0,0 @@ -{ - "input_info": [ - { - "sample_size": [1, 256], - "type": "long", - "keyword": "input_ids" - }, - { - "sample_size": [1, 256], - "type": "long", - "keyword": "attention_mask" - } - ], - "bootstrapNAS": { - "training": { - "algorithm": "progressive_shrinking", - "frozen_layers_allowed": true, - "progressivity_of_elasticity": ["width"], - "batchnorm_adaptation": { - "num_bn_adaptation_samples": 0 - }, - "schedule": { - "list_stage_descriptions": [ - { - "train_dims": ["width"], - "epochs": 3, - "depth_indicator": 1, - "width_indicator": 8, - "init_lr": 0.0003, - "epochs_lr": 3, - "sample_rate": 1 - } - ] - }, - "elasticity": { - "available_elasticity_dims": ["width"], - "width": { - "overwrite_groups": [ - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[q_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[k_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0", - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaSdpaAttention[self_attn]/Linear[v_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaMLP[mlp]/Linear[up_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[0]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[1]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[2]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[3]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[4]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[5]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[6]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[7]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[8]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[9]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[10]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[11]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - 
"PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[12]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[13]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[14]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[15]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[16]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[17]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[18]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[19]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[20]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ], - [ - "PeftModelForCausalLM/LoraModel[base_model]/LlamaForCausalLM[model]/LlamaModel[model]/ModuleList[layers]/LlamaDecoderLayer[21]/LlamaMLP[mlp]/Linear[down_proj]/ModuleDict[lora_A]/NNCFLinear[default]/linear_0" - ] - ], - "overwrite_groups_widths": [ - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8], - [16, 12, 8] - ] - } - } - } - } -} diff --git a/comps/finetuning_sqft/finetune_runner.py b/comps/finetuning_sqft/finetune_runner.py deleted file mode 100644 index 45cad43d56..0000000000 --- a/comps/finetuning_sqft/finetune_runner.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import 
argparse - -from pydantic_yaml import parse_yaml_raw_as -from transformers import TrainerCallback, TrainerControl, TrainerState, TrainingArguments - -from comps.finetuning_sqft.finetune_sqft_config import FinetuneConfig - - -class FineTuneCallback(TrainerCallback): - def __init__(self) -> None: - super().__init__() - - def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): - print("FineTuneCallback:", args, state) - - -def main(): - parser = argparse.ArgumentParser(description="Runner for llm_on_ray-finetune") - parser.add_argument("--config_file", type=str, required=True, default=None) - args = parser.parse_args() - model_config_file = args.config_file - - with open(model_config_file) as f: - finetune_config = parse_yaml_raw_as(FinetuneConfig, f).model_dump() - - callback = FineTuneCallback() - finetune_config["Training"]["callbacks"] = [callback] - - from comps.finetuning_sqft.llm_on_ray.finetune.finetune import main as llm_on_ray_finetune_main - - llm_on_ray_finetune_main(finetune_config) - - -if __name__ == "__main__": - main() diff --git a/comps/finetuning_sqft/finetune_sqft_config.py b/comps/finetuning_sqft/finetune_sqft_config.py deleted file mode 100644 index e5b35cef91..0000000000 --- a/comps/finetuning_sqft/finetune_sqft_config.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -# Copyright 2023 The LLM-on-Ray Authors. - -from typing import List, Optional, Union - -from pydantic import BaseModel, Field, root_validator, validator - -from comps.cores.proto.api_protocol import FineTuningJobIDRequest, FineTuningJobsRequest - -PRECISION_BF16 = "bf16" -PRECISION_FP16 = "fp16" -PRECISION_NO = "no" - -DEVICE_CPU = "cpu" -DEVICE_HPU = "hpu" -DEVICE_GPU = "gpu" -DEVICE_CUDA = "cuda" - -ACCELERATE_STRATEGY_DDP = "DDP" -ACCELERATE_STRATEGY_FSDP = "FSDP" -ACCELERATE_STRATEGY_DEEPSPEED = "DEEPSPEED" - - -class LoadConfig(BaseModel): - trust_remote_code: bool = False - # set Huggingface token to access dataset/model - token: Optional[str] = None - - -class LoraConfig(BaseModel): - task_type: str = "CAUSAL_LM" - r: int = 8 - lora_alpha: int = 16 - lora_dropout: float = 0.1 - target_modules: Optional[List[str]] = None - - -class SQFTLoRAConfig(LoraConfig): - neural_lora_search: bool = False - target_module_groups: Optional[List[List[str]]] = None - search_space: Optional[List[str]] = None - sparse_adapter: bool = False - nncf_config: Optional[str] = None - - @root_validator(pre=True) - def set_target_modules(cls, values): - target_module_groups = values.get("target_module_groups") - if target_module_groups is not None: - values["target_modules"] = [item for sublist in target_module_groups for item in sublist] - search_space = values.get("search_space") - if search_space is not None: - assert len(search_space) == len(target_module_groups) - return values - - -class GeneralConfig(BaseModel): - base_model: str = None - tokenizer_name: Optional[str] = None - gaudi_config_name: Optional[str] = None - gpt_base_model: bool = False - output_dir: str = "./tmp" - report_to: str = "none" - resume_from_checkpoint: Optional[str] = None - save_strategy: str = "no" - config: LoadConfig = LoadConfig() - lora_config: Optional[Union[LoraConfig, SQFTLoRAConfig]] = LoraConfig() - enable_gradient_checkpointing: bool = False - task: str = "instruction_tuning" - - @validator("report_to") - def check_report_to(cls, v: str): - assert v in ["none", "tensorboard"] - return v - - @validator("task") - def 
check_task(cls, v: str): - assert v in ["instruction_tuning"] - return v - - -class DatasetConfig(BaseModel): - train_file: str = None - validation_file: Optional[str] = None - validation_split_percentage: int = 5 - max_length: int = 512 - group: bool = True - block_size: int = 512 - shuffle: bool = False - max_source_length: int = 384 - padding_side: str = "right" - truncation_side: str = "right" - max_seq_length: int = 512 - truncation: bool = True - padding: Union[bool, str] = True - mask_input: bool = True - mask_response: bool = True - data_preprocess_type: str = "neural_chat" - max_train_samples: int = 0 - max_eval_samples: int = 0 - train_group_size: int = 8 - query_max_len: int = Field( - default=128, - description=( - "The maximum total input sequence length after tokenization for passage. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - ), - ) - passage_max_len: int = Field( - default=128, - description=( - "The maximum total input sequence length after tokenization for passage. Sequences longer " - "than this will be truncated, sequences shorter will be padded." - ), - ) - query_instruction_for_retrieval: Optional[str] = Field(default=None, description="instruction for query") - passage_instruction_for_retrieval: Optional[str] = Field(default=None, description="instruction for passage") - - -class RayResourceConfig(BaseModel): - CPU: int = 32 - GPU: int = 0 - HPU: int = 0 - - -class TrainingConfig(BaseModel): - optimizer: str = "adamw_torch" - batch_size: int = 2 - epochs: int = 1 - max_train_steps: Optional[int] = None - learning_rate: float = 5.0e-5 - lr_scheduler: str = "linear" - weight_decay: float = 0.0 - device: str = DEVICE_CPU - hpu_execution_mode: str = "lazy" - num_training_workers: int = 1 - resources_per_worker: RayResourceConfig = RayResourceConfig() - accelerate_mode: str = ACCELERATE_STRATEGY_DDP - mixed_precision: str = PRECISION_NO - gradient_accumulation_steps: int = 1 - logging_steps: int = 10 - deepspeed_config_file: str = "" - - @validator("device") - def check_device(cls, v: str): - # will convert to lower case - if v: - assert v.lower() in [DEVICE_CPU, DEVICE_GPU, DEVICE_HPU, DEVICE_CUDA] - return v.lower() - - @validator("hpu_execution_mode") - def check_hpu_execution_mode(cls, v: str): - if v: - assert v in ["lazy", "eager", "eager.compile"] - return v - - @validator("accelerate_mode") - def check_accelerate_mode(cls, v: str): - if v: - assert v in [ - ACCELERATE_STRATEGY_DDP, - ACCELERATE_STRATEGY_FSDP, - ACCELERATE_STRATEGY_DEEPSPEED, - ] - return v - - @validator("mixed_precision") - def check_mixed_precision(cls, v: str): - if v: - assert v in [PRECISION_BF16, PRECISION_FP16, PRECISION_NO] - return v - - @validator("logging_steps") - def check_logging_steps(cls, v: int): - assert v > 0 - return v - - # @model_validator(mode='after') - # def check_device_and_accelerate_mode(self) -> "Training": - # dev = self.device - # res = self.resources_per_worker - # mode = self.accelerate_mode - # if dev == "CPU": - # if res.GPU is not None and res.GPU > 0: - # raise ValueError("Please not specified GPU resource when use CPU only in Ray.") - # if mode != "CPU_DDP": - # raise ValueError("Please specified CPU related accelerate mode when use CPU only in Ray.") - # elif dev == "GPU": - # if res.GPU is None or res.GPU == 0: - # raise ValueError("Please specified GPU resource when use GPU to fine tune in Ray.") - # if mode not in ["GPU_DDP", "GPU_FSDP"]: - # raise ValueError("Please speicifed GPU related accelerate mode when 
use GPU to fine tune in Ray.") - - # return self - - -class FinetuneConfig(BaseModel): - General: GeneralConfig = GeneralConfig() - Dataset: DatasetConfig = DatasetConfig() - Training: TrainingConfig = TrainingConfig() - - -class FineTuningParams(FineTuningJobsRequest): - # priority use FineTuningJobsRequest params - General: GeneralConfig = GeneralConfig() - Dataset: DatasetConfig = DatasetConfig() - Training: TrainingConfig = TrainingConfig() - - -class ExtractAdapterParams(FineTuningJobIDRequest): - sub_adapter_version: str = "heuristic" - custom_config: Optional[List[int]] = None - - -class MergeAdapterParams(FineTuningJobIDRequest): - adapter_version: Optional[str] = None diff --git a/comps/finetuning_sqft/finetuning_sqft_service.py b/comps/finetuning_sqft/finetuning_sqft_service.py deleted file mode 100644 index af9f237399..0000000000 --- a/comps/finetuning_sqft/finetuning_sqft_service.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -from fastapi import BackgroundTasks, Depends - -from comps import opea_microservices, register_microservice -from comps.cores.proto.api_protocol import FineTuningJobIDRequest, UploadFileRequest -from comps.finetuning_sqft.finetune_sqft_config import ExtractAdapterParams, FineTuningParams, MergeAdapterParams -from comps.finetuning_sqft.handlers import ( - handle_cancel_finetuning_job, - handle_create_finetuning_jobs, - handle_extract_sub_adapter, - handle_list_finetuning_checkpoints, - handle_list_finetuning_jobs, - handle_merge_adapter, - handle_retrieve_finetuning_job, - handle_upload_training_files, - upload_file, -) - - -@register_microservice(name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015) -def create_finetuning_jobs(request: FineTuningParams, background_tasks: BackgroundTasks): - return handle_create_finetuning_jobs(request, background_tasks) - - -@register_microservice( - name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs", host="0.0.0.0", port=8015, methods=["GET"] -) -def list_finetuning_jobs(): - return handle_list_finetuning_jobs() - - -@register_microservice( - name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/retrieve", host="0.0.0.0", port=8015 -) -def retrieve_finetuning_job(request: FineTuningJobIDRequest): - job = handle_retrieve_finetuning_job(request) - return job - - -@register_microservice( - name="opea_service@finetuning", endpoint="/v1/fine_tuning/jobs/cancel", host="0.0.0.0", port=8015 -) -def cancel_finetuning_job(request: FineTuningJobIDRequest): - job = handle_cancel_finetuning_job(request) - return job - - -@register_microservice( - name="opea_service@finetuning", - endpoint="/v1/files", - host="0.0.0.0", - port=8015, -) -async def upload_training_files(request: UploadFileRequest = Depends(upload_file)): - uploadFileInfo = await handle_upload_training_files(request) - return uploadFileInfo - - -@register_microservice( - name="opea_service@finetuning", endpoint="/v1/finetune/list_checkpoints", host="0.0.0.0", port=8015 -) -def list_checkpoints(request: FineTuningJobIDRequest): - checkpoints = handle_list_finetuning_checkpoints(request) - return checkpoints - - -@register_microservice( - name="opea_service@finetuning", endpoint="/v1/finetune/extract_adapter", host="0.0.0.0", port=8015 -) -def extract_sub_adapter(request: ExtractAdapterParams): - return handle_extract_sub_adapter(request) - - -@register_microservice(name="opea_service@finetuning", endpoint="/v1/finetune/merge_adapter", 
host="0.0.0.0", port=8015) -def merge_adapter(request: MergeAdapterParams): - return handle_merge_adapter(request) - - -if __name__ == "__main__": - opea_microservices["opea_service@finetuning"].start() diff --git a/comps/finetuning_sqft/handlers.py b/comps/finetuning_sqft/handlers.py deleted file mode 100644 index cdb6b224aa..0000000000 --- a/comps/finetuning_sqft/handlers.py +++ /dev/null @@ -1,339 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import os -import random -import re -import time -import urllib.parse -import uuid -from pathlib import Path -from typing import Dict - -from fastapi import BackgroundTasks, File, Form, HTTPException, UploadFile -from pydantic_yaml import parse_yaml_file_as, to_yaml_file -from ray.job_submission import JobSubmissionClient - -from comps import CustomLogger -from comps.cores.proto.api_protocol import ( - FileObject, - FineTuningJob, - FineTuningJobCheckpoint, - FineTuningJobIDRequest, - FineTuningJobList, - UploadFileRequest, -) -from comps.finetuning_sqft.finetune_sqft_config import ( - ExtractAdapterParams, - FinetuneConfig, - FineTuningParams, - MergeAdapterParams, -) - -logger = CustomLogger("finetuning_handlers") - -DATASET_BASE_PATH = "datasets" -JOBS_PATH = "jobs" -OUTPUT_DIR = "output" - -if not os.path.exists(DATASET_BASE_PATH): - os.mkdir(DATASET_BASE_PATH) -if not os.path.exists(JOBS_PATH): - os.mkdir(JOBS_PATH) -if not os.path.exists(OUTPUT_DIR): - os.mkdir(OUTPUT_DIR) - -FineTuningJobID = str -CheckpointID = str -CheckpointPath = str - -CHECK_JOB_STATUS_INTERVAL = 5 # Check every 5 secs - -global ray_client -ray_client: JobSubmissionClient = None - -running_finetuning_jobs: Dict[FineTuningJobID, FineTuningJob] = {} -finetuning_job_to_ray_job: Dict[FineTuningJobID, str] = {} -checkpoint_id_to_checkpoint_path: Dict[CheckpointID, CheckpointPath] = {} - - -# Add a background task to periodicly update job status -def update_job_status(job_id: FineTuningJobID): - while True: - job_status = ray_client.get_job_status(finetuning_job_to_ray_job[job_id]) - status = str(job_status).lower() - # Ray status "stopped" is OpenAI status "cancelled" - status = "cancelled" if status == "stopped" else status - logger.info(f"Status of job {job_id} is '{status}'") - running_finetuning_jobs[job_id].status = status - if status == "succeeded" or status == "cancelled" or status == "failed": - break - time.sleep(CHECK_JOB_STATUS_INTERVAL) - - -def handle_create_finetuning_jobs(request: FineTuningParams, background_tasks: BackgroundTasks): - base_model = request.model - train_file = request.training_file - train_file_path = os.path.join(DATASET_BASE_PATH, train_file) - - if not os.path.exists(train_file_path): - raise HTTPException(status_code=404, detail=f"Training file '{train_file}' not found!") - - finetune_config = FinetuneConfig(General=request.General, Dataset=request.Dataset, Training=request.Training) - finetune_config.General.base_model = base_model - finetune_config.Dataset.train_file = train_file_path - if request.hyperparameters is not None: - if request.hyperparameters.epochs != "auto": - finetune_config.Training.epochs = request.hyperparameters.epochs - - if request.hyperparameters.batch_size != "auto": - finetune_config.Training.batch_size = request.hyperparameters.batch_size - - if request.hyperparameters.learning_rate_multiplier != "auto": - finetune_config.Training.learning_rate = request.hyperparameters.learning_rate_multiplier - - if os.getenv("HF_TOKEN", None): - finetune_config.General.config.token = 
os.getenv("HF_TOKEN", None) - - job = FineTuningJob( - id=f"ft-job-{uuid.uuid4()}", - model=base_model, - created_at=int(time.time()), - training_file=train_file, - hyperparameters={ - "n_epochs": finetune_config.Training.epochs, - "batch_size": finetune_config.Training.batch_size, - "learning_rate_multiplier": finetune_config.Training.learning_rate, - }, - status="running", - seed=random.randint(0, 1000) if request.seed is None else request.seed, - ) - finetune_config.General.output_dir = os.path.join(OUTPUT_DIR, job.id) - if os.getenv("DEVICE", ""): - logger.info(f"specific device: {os.getenv('DEVICE')}") - - finetune_config.Training.device = os.getenv("DEVICE") - if finetune_config.Training.device == "hpu": - if finetune_config.Training.resources_per_worker.HPU == 0: - # set 1 - finetune_config.Training.resources_per_worker.HPU = 1 - - finetune_config_file = f"{JOBS_PATH}/{job.id}.yaml" - to_yaml_file(finetune_config_file, finetune_config) - - global ray_client - ray_client = JobSubmissionClient() if ray_client is None else ray_client - - ray_job_id = ray_client.submit_job( - # Entrypoint shell command to execute - entrypoint=f"python finetune_runner.py --config_file {finetune_config_file}", - ) - - logger.info(f"Submitted Ray job: {ray_job_id} ...") - - running_finetuning_jobs[job.id] = job - finetuning_job_to_ray_job[job.id] = ray_job_id - - background_tasks.add_task(update_job_status, job.id) - - return job - - -def handle_extract_sub_adapter(request: ExtractAdapterParams): - fine_tuning_job_id = request.fine_tuning_job_id - finetune_config_file = f"{JOBS_PATH}/{fine_tuning_job_id}.yaml" - finetune_config = parse_yaml_file_as(FinetuneConfig, finetune_config_file) - - job = running_finetuning_jobs.get(fine_tuning_job_id) - if job is None: - raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") - finetuned_model_path = os.path.join(OUTPUT_DIR, fine_tuning_job_id) - assert finetuned_model_path == finetune_config.General.output_dir - if not os.path.exists(finetuned_model_path): - raise HTTPException( - status_code=404, - detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!", - ) - if job.status != "succeeded": - raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' has not completed!") - - if finetune_config.General.lora_config is None: - raise HTTPException( - status_code=404, - detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!", - ) - if not finetune_config.General.lora_config.neural_lora_search: - raise HTTPException( - status_code=404, - detail=f"The fine-tuning job '{fine_tuning_job_id}' did not enable NLS algorithm, " - f"there is no need to extract sub-adapters!", - ) - nncf_config_path = os.path.join(finetune_config.General.output_dir, "nncf_config.json") - if not os.path.exists(nncf_config_path): - raise HTTPException( - status_code=404, detail=f"The NNCF config file does not exist in the fine-tuning job '{fine_tuning_job_id}!" 
- ) - - from comps.finetuning_sqft.utils.extract_sub_adapter import main as extract_sub_adapter_main - - extract_sub_adapter_main( - adapter_model_path=finetuned_model_path, - nncf_config=nncf_config_path, - sub_adapter_version=request.sub_adapter_version, - custom_config=request.custom_config, - ) - - return fine_tuning_job_id - - -def handle_merge_adapter(request: MergeAdapterParams): - fine_tuning_job_id = request.fine_tuning_job_id - finetune_config_file = f"{JOBS_PATH}/{fine_tuning_job_id}.yaml" - finetune_config = parse_yaml_file_as(FinetuneConfig, finetune_config_file) - - job = running_finetuning_jobs.get(fine_tuning_job_id) - if job is None: - raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") - finetuned_model_path = os.path.join(OUTPUT_DIR, fine_tuning_job_id) - assert finetuned_model_path == finetune_config.General.output_dir - if not os.path.exists(finetuned_model_path): - raise HTTPException( - status_code=404, - detail=f"The fine-tuned model saved by the fine-tuning job '{fine_tuning_job_id}' was not found!", - ) - if job.status != "succeeded": - raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' has not completed!") - - if finetune_config.General.lora_config is None: - raise HTTPException( - status_code=404, - detail=f"The fine-tuning job '{fine_tuning_job_id}' does not enable LoRA adapter fine-tuning!", - ) - - adapter_path = finetuned_model_path - adapter_version = request.adapter_version - if adapter_version is not None: - adapter_path = os.path.join(adapter_path, adapter_version) - if not os.path.exists(adapter_path): - raise HTTPException( - status_code=404, - detail=f"The fine-tuning job '{fine_tuning_job_id}' does not have a '{adapter_version}' adapter!", - ) - - from comps.finetuning_sqft.utils.merge import main as merge_adapter_main - - merge_adapter_main( - base_model_path=finetune_config.General.base_model, - adapter_model_path=adapter_path, - output_path=os.path.join(adapter_path, "merged_model"), - ) - - return fine_tuning_job_id - - -def handle_list_finetuning_jobs(): - finetuning_jobs_list = FineTuningJobList(data=list(running_finetuning_jobs.values()), has_more=False) - - return finetuning_jobs_list - - -def handle_retrieve_finetuning_job(request: FineTuningJobIDRequest): - fine_tuning_job_id = request.fine_tuning_job_id - - job = running_finetuning_jobs.get(fine_tuning_job_id) - if job is None: - raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") - return job - - -def handle_cancel_finetuning_job(request: FineTuningJobIDRequest): - fine_tuning_job_id = request.fine_tuning_job_id - - ray_job_id = finetuning_job_to_ray_job.get(fine_tuning_job_id) - if ray_job_id is None: - raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") - - global ray_client - ray_client = JobSubmissionClient() if ray_client is None else ray_client - ray_client.stop_job(ray_job_id) - - job = running_finetuning_jobs.get(fine_tuning_job_id) - job.status = "cancelled" - return job - - -async def save_content_to_local_disk(save_path: str, content): - save_path = Path(save_path) - try: - if isinstance(content, str): - with open(save_path, "w", encoding="utf-8") as file: - file.write(content) - else: - with save_path.open("wb") as fout: - content = await content.read() - fout.write(content) - except Exception as e: - logger.info(f"Write file failed. 
Exception: {e}") - raise Exception(status_code=500, detail=f"Write file {save_path} failed. Exception: {e}") - - -def handle_list_finetuning_checkpoints(request: FineTuningJobIDRequest): - fine_tuning_job_id = request.fine_tuning_job_id - - job = running_finetuning_jobs.get(fine_tuning_job_id) - if job is None: - raise HTTPException(status_code=404, detail=f"Fine-tuning job '{fine_tuning_job_id}' not found!") - output_dir = os.path.join(OUTPUT_DIR, job.id) - checkpoints = [] - if os.path.exists(output_dir): - # Iterate over the contents of the directory and add an entry for each - files = os.listdir(output_dir) - for file in files: # Loop over directory contents - file_path = os.path.join(output_dir, file) - if os.path.isdir(file_path) and file.startswith("checkpoint"): - steps = re.findall("\d+", file)[0] - checkpointsResponse = FineTuningJobCheckpoint( - id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID - created_at=int(time.time()), # Use the current timestamp - fine_tuned_model_checkpoint=file_path, # Directory path itself - fine_tuning_job_id=fine_tuning_job_id, - object="fine_tuning.job.checkpoint", - step_number=steps, - ) - checkpoints.append(checkpointsResponse) - if job.status == "succeeded": - checkpointsResponse = FineTuningJobCheckpoint( - id=f"ftckpt-{uuid.uuid4()}", # Generate a unique ID - created_at=int(time.time()), # Use the current timestamp - fine_tuned_model_checkpoint=output_dir, # Directory path itself - fine_tuning_job_id=fine_tuning_job_id, - object="fine_tuning.job.checkpoint", - ) - checkpoints.append(checkpointsResponse) - - return checkpoints - - -async def upload_file(purpose: str = Form(...), file: UploadFile = File(...)): - return UploadFileRequest(purpose=purpose, file=file) - - -async def handle_upload_training_files(request: UploadFileRequest): - file = request.file - if file is None: - raise HTTPException(status_code=404, detail="upload file failed!") - filename = urllib.parse.quote(file.filename, safe="") - save_path = os.path.join(DATASET_BASE_PATH, filename) - await save_content_to_local_disk(save_path, file) - - fileBytes = os.path.getsize(save_path) - fileInfo = FileObject( - id=f"file-{uuid.uuid4()}", - object="file", - bytes=fileBytes, - created_at=int(time.time()), - filename=filename, - purpose="fine-tune", - ) - - return fileInfo diff --git a/comps/finetuning_sqft/launch.sh b/comps/finetuning_sqft/launch.sh deleted file mode 100644 index 034c82f3d2..0000000000 --- a/comps/finetuning_sqft/launch.sh +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -if [[ -n "$RAY_PORT" ]];then - ray start --head --port $RAY_PORT --dashboard-host=0.0.0.0 -else - ray start --head --dashboard-host=0.0.0.0 - export RAY_PORT=8265 -fi - -export RAY_ADDRESS=http://localhost:$RAY_PORT -python finetuning_sqft_service.py diff --git a/comps/finetuning_sqft/llm_on_ray/common/__init__.py b/comps/finetuning_sqft/llm_on_ray/common/__init__.py deleted file mode 100644 index 954b7baa4b..0000000000 --- a/comps/finetuning_sqft/llm_on_ray/common/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -# Copyright 2023 The LLM-on-Ray Authors. 
- -from .torch_config import TorchConfig diff --git a/comps/finetuning_sqft/llm_on_ray/common/common.py b/comps/finetuning_sqft/llm_on_ray/common/common.py deleted file mode 100644 index ac01ae12e1..0000000000 --- a/comps/finetuning_sqft/llm_on_ray/common/common.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -# Copyright 2023 The LLM-on-Ray Authors. - -import glob -import importlib -import os - -from comps import CustomLogger - -logger = CustomLogger("llm_on_ray") - - -def import_all_modules(basedir, prefix=None): - all_py_files = glob.glob(basedir + "/*.py") - modules = [os.path.basename(f) for f in all_py_files] - - for module in modules: - if not module.startswith("_"): - module = module.rstrip(".py") - if prefix is None: - module_name = module - else: - module_name = f"{prefix}.{module}" - try: - importlib.import_module(module_name) - except Exception: - logger.warning(f"import {module_name} error", exc_info=True) diff --git a/comps/finetuning_sqft/llm_on_ray/common/torch_config.py b/comps/finetuning_sqft/llm_on_ray/common/torch_config.py deleted file mode 100644 index 9e3f48a7c3..0000000000 --- a/comps/finetuning_sqft/llm_on_ray/common/torch_config.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -# Copyright 2023 The LLM-on-Ray Authors. - -import os -import sys -from dataclasses import dataclass -from typing import Optional - -from ray.train._internal.worker_group import WorkerGroup -from ray.train.torch.config import TorchConfig as RayTorchConfig -from ray.train.torch.config import _TorchBackend - -# The package importlib_metadata is in a different place, depending on the Python version. -if sys.version_info < (3, 8): - import importlib_metadata -else: - import importlib.metadata as importlib_metadata - - -@dataclass -class TorchConfig(RayTorchConfig): - device: Optional[str] = None - - @property - def backend_cls(self): - EnableCCLBackend.device = self.device - return EnableCCLBackend - - -def xpu_libs_import(): - """Try to import IPEX and oneCCL.""" - try: - import intel_extension_for_pytorch - except ImportError: - raise ImportError("Please install intel_extension_for_pytorch") - try: - ccl_version = importlib_metadata.version("oneccl_bind_pt") - if ccl_version >= "1.12": - import oneccl_bindings_for_pytorch - else: - import torch_ccl - except ImportError as ccl_not_exist: - raise ImportError("Please install torch-ccl") from ccl_not_exist - - -def hpu_libs_import(): - """Try to import habana frameworkfs for torch.""" - try: - import habana_frameworks.torch # noqa: F401 - except ImportError as habana_not_exist: - raise ImportError("Please install habana_frameworks") from habana_not_exist - - -def _set_torch_distributed_env_vars(device): - if device is not None: - os.environ["ACCELERATE_TORCH_DEVICE"] = device - - -class EnableCCLBackend(_TorchBackend): - device: Optional[str] = None - - def on_start(self, worker_group: WorkerGroup, backend_config: RayTorchConfig): - libs_import = hpu_libs_import if self.device is not None and self.device.startswith("hpu") else xpu_libs_import - for i in range(len(worker_group)): - worker_group.execute_single_async(i, libs_import) - super().on_start(worker_group, backend_config) - - def on_training_start(self, worker_group: WorkerGroup, backend_config: RayTorchConfig): - super().on_training_start(worker_group, backend_config) - worker_group.execute(_set_torch_distributed_env_vars, self.device) diff --git 
a/comps/finetuning_sqft/llm_on_ray/finetune/__init__.py b/comps/finetuning_sqft/llm_on_ray/finetune/__init__.py deleted file mode 100644 index 0262e494a9..0000000000 --- a/comps/finetuning_sqft/llm_on_ray/finetune/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -# Copyright 2023 The LLM-on-Ray Authors. diff --git a/comps/finetuning_sqft/llm_on_ray/finetune/data_process.py b/comps/finetuning_sqft/llm_on_ray/finetune/data_process.py deleted file mode 100644 index 07b12d71e1..0000000000 --- a/comps/finetuning_sqft/llm_on_ray/finetune/data_process.py +++ /dev/null @@ -1,352 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -# Copyright 2023 The LLM-on-Ray Authors. - -import copy -import math -import random -import re -from dataclasses import dataclass -from itertools import chain -from typing import Dict, List, Tuple - -import torch -from torch.utils.data import Dataset -from transformers import BatchEncoding, DataCollatorWithPadding - -IGNORE_INDEX = -100 - - -class InstructionDataProcessor: - # We used the following prompts for fine-tuning the Alpaca model. You can find reference doc form this URL(https://github.com/tatsu-lab/stanford_alpaca/blob/main/README.md#data-release) - def __init__(self, config, tokenizer): - self.tokenizer = tokenizer - self.end = tokenizer.eos_token - self.intro = ( - "Below is an instruction that describes a task. Write a response that appropriately completes the request." - ) - self.instruction = "### Instruction:\n" - self.input = "### Input:\n" - self.response = "### Response:\n" - self.padding_side = config["Dataset"].get("padding_side", "right") - self.truncation_side = config["Dataset"].get("truncation_side", "right") - self.max_length = self.max_seq_length = config["Dataset"].get("max_length", 512) - self.max_source_length = config["Dataset"].get("max_source_length", 384) - self.truncation = config["Dataset"].get("truncation", True) - self.padding = config["Dataset"].get("padding", True) - self.mask_input = config["Dataset"].get("mask_input", True) - self.mask_response = config["Dataset"].get("mask_response", True) - - def make_prompt(self, examples): - prompts = {} - prompts["prompt_sources"] = [] - prompts["prompt_targets"] = [] - for rec in examples: - instruction = rec["instruction"] - response = rec["input"] - context = rec.get("output") - if not instruction: - raise ValueError(f"Expected an instruction in: {rec}") - # if not response: - # raise ValueError(f"Expected a response in: {rec}") - if context: - prompt = ( - self.intro - + self.end - + "\n" - + self.instruction - + instruction - + self.input - + context - + self.end - + "\n" - + self.response - ) - prompts["prompt_sources"].append(prompt) - else: - prompt = self.intro + self.end + "\n" + self.instruction + instruction + self.end + "\n" + self.response - prompts["prompt_sources"].append(prompt) - prompt_response = response + self.end - prompts["prompt_targets"].append(prompt_response) - return prompts - - def __truncate_sequences(self, sequences, max_length): - """ - Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L40 - """ - words_to_cut = sum(list(map(len, sequences))) - max_length - if words_to_cut <= 0: - return sequences - - while words_to_cut > 0 and len(sequences) > 0: - words_to_cut -= len(sequences[0]) - sequences = sequences[1:] 
- return sequences - - def tokenize_by_neural_chat(self, examples): - """ - Copied from https://github.com/intel/intel-extension-for-transformers/blob/ae54f698b73a66e5729427cb19f69c33e1a5c34d/intel_extension_for_transformers/transformers/llm/finetuning/data_utils.py#L225 - The only differences are: - - using our own prompt style - - add left or right padding and truncation - - add mask_input and mask_response - """ - keys = list(examples.data.keys()) - if len(keys) != 2: - raise ValueError("Unsupported dataset format") - assistant_tokens = self.tokenizer.tokenize(self.response) - header = self.intro + self.end + "\n" - - examples["input_ids"] = [] - examples["labels"] = [] - examples["attention_mask"] = [] - for instruction, response in zip(examples[keys[0]], examples[keys[1]]): - convs = re.findall( - r"{0}.*?{2}|{1}.*?{2}".format(self.instruction, self.response, self.end), - instruction, - re.DOTALL, - ) - convs_tokens = [self.tokenizer.tokenize(conv) + self.tokenizer.tokenize("\n") for conv in convs] - header_tokens = self.tokenizer.tokenize(header) + self.tokenizer.tokenize("\n") - max_input = self.max_source_length - len(header_tokens) - len(assistant_tokens) - truncated_convs = self.__truncate_sequences(convs_tokens, max_input) - if len(truncated_convs) == 0: - truncated_convs = [convs_tokens[-1][: max_input - 3] + convs_tokens[-1][-3:]] - - prompt_tokens = [header_tokens] + truncated_convs + [assistant_tokens] - prompt_ids = [self.tokenizer.convert_tokens_to_ids(prompt_token) for prompt_token in prompt_tokens] - prompt_ids = list(chain(*prompt_ids)) - - resp_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(response.strip())) - # keep last and eos_id - max_resp = self.max_seq_length - len(prompt_ids) - 1 - - # truncating response - if len(resp_ids) > max_resp: - if self.truncation_side == "right": - resp_ids = resp_ids[: max_resp - 1] + resp_ids[-1:] - else: - resp_ids = resp_ids[-max_resp:] - - # masking - input_ids = prompt_ids + resp_ids + [self.tokenizer.eos_token_id] - if self.mask_input: - labels = [IGNORE_INDEX] * len(prompt_ids) + resp_ids + [self.tokenizer.eos_token_id] - elif self.mask_response: - labels = prompt_ids + [IGNORE_INDEX] * len(resp_ids) + [self.tokenizer.eos_token_id] - else: - labels = input_ids - - # padding - input_len = len(input_ids) - pad_len = self.max_seq_length - input_len - if self.padding_side == "right": - input_ids = input_ids + [self.tokenizer.eos_token_id] * pad_len - labels = labels + [IGNORE_INDEX] * pad_len - attention_mask = [1] * input_len + [0] * pad_len - else: - input_ids = [self.tokenizer.eos_token_id] * pad_len + input_ids - labels = [IGNORE_INDEX] * pad_len + labels - attention_mask = [0] * pad_len + [1] * input_len - - assert len(input_ids) == self.max_seq_length - assert len(prompt_ids) <= self.max_source_length - assert len(labels) == len(input_ids) == len(attention_mask) - - examples["input_ids"].append(torch.tensor(input_ids)) - examples["labels"].append(labels) - examples["attention_mask"].append(attention_mask) - - return examples - - def tokenize(self, examples): - keys = list(examples.data.keys()) - if len(keys) != 2: - raise ValueError("Unsupported dataset format") - - examples["input_ids"] = [] - examples["labels"] = [] - examples["attention_mask"] = [] - for s, t in zip(examples[keys[0]], examples[keys[1]]): - results = self.tokenizer( - s + t, - padding=self.padding, - truncation=self.truncation, - return_tensors=None, - max_length=self.max_length, - ) - - input_ids = results["input_ids"] - input_len = 
len(input_ids) - labels = copy.deepcopy(input_ids) - if self.mask_input or self.mask_response: - sources_tokenized = self.tokenizer( - s, - padding=False, - truncation=True, - return_tensors=None, - max_length=self.max_length, - ) - input_id_len = len(sources_tokenized["input_ids"]) - # mask input - if self.mask_input: - labels[:input_id_len] = [IGNORE_INDEX] * input_id_len - # mask response - if self.mask_response: - labels[input_id_len:input_len] = [IGNORE_INDEX] * (input_len - input_id_len) - - examples["input_ids"].append(results["input_ids"]) - examples["labels"].append(labels) - examples["attention_mask"].append(results["attention_mask"]) - return examples - - -class PretrainingDataProcessor: - def __init__(self, config, tokenizer): - self.tokenizer = tokenizer - self.max_length = self.max_seq_length = config["Dataset"].get("max_length", 512) - self.truncation = config["Dataset"].get("truncation", True) - self.padding = config["Dataset"].get("padding", True) - - def tokenize(self, examples): - keys = list(examples.data.keys()) - if len(keys) != 1 and "text" not in keys: - raise ValueError("Unsupported dataset format") - - key = keys[0] if len(keys) == 1 else "text" - examples["input_ids"] = [] - examples["labels"] = [] - examples["attention_mask"] = [] - for exp in examples[key]: - results = self.tokenizer( - exp, - padding=self.padding, - truncation=self.truncation, - return_tensors=None, - max_length=self.max_length, - ) - - input_ids = results["input_ids"] - labels = copy.deepcopy(input_ids) - examples["input_ids"].append(results["input_ids"]) - examples["labels"].append(labels) - examples["attention_mask"].append(results["attention_mask"]) - return examples - - -class TrainDatasetForCE(Dataset): - def __init__(self, dataset, args, tokenizer): - self.dataset = dataset - self.tokenizer = tokenizer - self.args = args - self.total_len = len(self.dataset) - - def create_one_example(self, qry_encoding: str, doc_encoding: str): - item = self.tokenizer.encode_plus( - qry_encoding, - doc_encoding, - truncation=True, - max_length=self.args.get("max_length", 512), - padding=False, - ) - return item - - def __len__(self): - return self.total_len - - def __getitem__(self, item) -> List[BatchEncoding]: - query = self.dataset[item]["query"] - pos = random.choice(self.dataset[item]["pos"]) - train_group_size = self.args.get("train_group_size", 8) - if len(self.dataset[item]["neg"]) < train_group_size - 1: - num = math.ceil((train_group_size - 1) / len(self.dataset[item]["neg"])) - negs = random.sample(self.dataset[item]["neg"] * num, train_group_size - 1) - else: - negs = random.sample(self.dataset[item]["neg"], train_group_size - 1) - - batch_data = [] - batch_data.append(self.create_one_example(query, pos)) - for neg in negs: - batch_data.append(self.create_one_example(query, neg)) - - return batch_data - - -@dataclass -class GroupCollator(DataCollatorWithPadding): - def __call__(self, features) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]: - if isinstance(features[0], list): - features = sum(features, []) - return super().__call__(features) - - -class TrainDatasetForEmbedding(Dataset): - def __init__(self, dataset, args, tokenizer): - self.dataset = dataset - self.tokenizer = tokenizer - self.args = args - self.total_len = len(self.dataset) - - def __len__(self): - return self.total_len - - def __getitem__(self, item) -> Tuple[str, List[str]]: - query = self.dataset[item]["query"] - if self.args["query_instruction_for_retrieval"] is not None: - query = 
self.args["query_instruction_for_retrieval"] + query - - passages = [] - - assert isinstance(self.dataset[item]["pos"], list) - pos = random.choice(self.dataset[item]["pos"]) - passages.append(pos) - - train_group_size = self.args.get("train_group_size", 8) - if len(self.dataset[item]["neg"]) < train_group_size - 1: - num = math.ceil((train_group_size - 1) / len(self.dataset[item]["neg"])) - negs = random.sample(self.dataset[item]["neg"] * num, train_group_size - 1) - else: - negs = random.sample(self.dataset[item]["neg"], train_group_size - 1) - passages.extend(negs) - - if self.args["passage_instruction_for_retrieval"] is not None: - passages = [self.args["passage_instruction_for_retrieval"] + p for p in passages] - return query, passages - - -@dataclass -class EmbedCollator(DataCollatorWithPadding): - """Wrapper that does conversion from List[Tuple[encode_qry, encode_psg]] to List[qry], List[psg] - and pass batch separately to the actual collator. - - Abstract out data detail for the model. - """ - - query_max_len: int = 32 - passage_max_len: int = 128 - - def __call__(self, features): - query = [f[0] for f in features] - passage = [f[1] for f in features] - - if isinstance(query[0], list): - query = sum(query, []) - if isinstance(passage[0], list): - passage = sum(passage, []) - - q_collated = self.tokenizer( - query, - padding=self.padding, - truncation=True, - max_length=self.query_max_len, - return_tensors="pt", - ) - d_collated = self.tokenizer( - passage, - padding=self.padding, - truncation=True, - max_length=self.passage_max_len, - return_tensors="pt", - ) - return {"query": q_collated, "passage": d_collated} diff --git a/comps/finetuning_sqft/llm_on_ray/finetune/finetune.py b/comps/finetuning_sqft/llm_on_ray/finetune/finetune.py deleted file mode 100644 index 8433cbacb8..0000000000 --- a/comps/finetuning_sqft/llm_on_ray/finetune/finetune.py +++ /dev/null @@ -1,609 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 -# -# Copyright 2023 The LLM-on-Ray Authors. - -#!/usr/bin/env python - -import argparse -import os -import sys -from itertools import chain -from typing import Any, Dict, Optional - -import datasets -import ray -import torch -import transformers -from peft import LoraConfig, get_peft_model -from pydantic_yaml import parse_yaml_raw_as -from ray.air import FailureConfig, RunConfig -from ray.air.config import ScalingConfig -from ray.train.torch import TorchTrainer -from transformers import Trainer, TrainingArguments - -from comps import CustomLogger -from comps.finetuning_sqft.finetune_sqft_config import FinetuneConfig -from comps.finetuning_sqft.llm_on_ray import common -from comps.finetuning_sqft.llm_on_ray.finetune.data_process import ( - EmbedCollator, - GroupCollator, - InstructionDataProcessor, - PretrainingDataProcessor, - TrainDatasetForCE, - TrainDatasetForEmbedding, -) -from comps.finetuning_sqft.llm_on_ray.finetune.modeling import BiEncoderModel, CrossEncoder - -logger = CustomLogger("llm_on_ray/finetune") - -try: - from nncf.experimental.torch.nas.bootstrapNAS.training.model_creator_helpers import ( - create_compressed_model_from_algo_names, - ) - from nncf.torch.model_creation import create_nncf_network - - from comps.finetuning_sqft.utils.nncf_config_process import load_nncf_config - - is_nncf_available = True -except ImportError: - is_nncf_available = False - logger.info("NNCF is not installed. 
Please install it if necessary.") - - -def adapt_transformers_to_device(config: Dict): - device = config["Training"]["device"] - if device in ["hpu"]: - from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi - - # adapt transformers to gaudi - adapt_transformers_to_gaudi() - - -def set_seed(config: Dict): - seed = config["Training"].get("seed", None) - if seed is None: - return - device = config["Training"]["device"] - if device in ["cpu", "gpu"]: - from accelerate.utils import set_seed as _set_seed - - _set_seed(seed) - elif device in ["hpu"]: - from optimum.habana.utils import set_seed as _set_seed - - _set_seed(seed) - - -def convert_to_training_args(cls, config: Dict): - device = config["Training"]["device"] - accelerate_mode = config["Training"]["accelerate_mode"] - save_strategy = config["General"]["save_strategy"] - - args = { - "output_dir": config["General"]["output_dir"], - "report_to": config["General"]["report_to"], - "resume_from_checkpoint": config["General"]["resume_from_checkpoint"], - "gradient_checkpointing": config["General"]["enable_gradient_checkpointing"], - "save_strategy": save_strategy if save_strategy != "False" else "no", - "bf16": config["Training"]["mixed_precision"] == "bf16", - "num_train_epochs": config["Training"]["epochs"], - "per_device_train_batch_size": config["Training"]["batch_size"], - "per_device_eval_batch_size": config["Training"]["batch_size"], - "optim": config["Training"]["optimizer"], - "learning_rate": config["Training"]["learning_rate"], - "logging_steps": config["Training"]["logging_steps"], - "lr_scheduler_type": config["Training"]["lr_scheduler"], - "weight_decay": config["Training"]["weight_decay"], - "gradient_accumulation_steps": config["Training"]["gradient_accumulation_steps"], - "do_train": True, - } - - # set attr do_eval - vf = config["Dataset"].get("validation_file", None) - vsp = config["Dataset"].get("validation_split_percentage", 0) - if vf is not None or (vsp / 100 > 0.0 and vsp / 100 < 1.0): - args.update({"do_eval": True}) - - # set attr max_steps - if config["Training"]["max_train_steps"] is not None: - args.update({"max_steps": config["Training"]["max_train_steps"]}) - - # set attr for device cpu - if device == "cpu": - if hasattr(cls, "use_cpu"): - args.update({"use_cpu": True}) - if hasattr(cls, "no_cuda"): - args.update({"no_cuda": True}) - # To be tested: whether it works when enabling Neural Lora Search (using NNCF) - args.update({"use_ipex": True}) - - # set attr 'deepspeed' - if accelerate_mode == "DEEPSPEED": - args.update({"deepspeed": config["Training"]["deepspeed_config_file"]}) - - # set attr for FSDP - # if accelerate_mode == "FSDP": - # args.updatwe({}) - - # set attr for Intel Gaudi - if device == "hpu": - args.update({"use_habana": True}) - args.update({"use_lazy_mode": config["Training"]["hpu_execution_mode"] == "lazy"}) - args.update({"pipelining_fwd_bwd": True}) - - return cls(**args) - - -def convert_dtype(dtype: str) -> Optional[torch.dtype]: - supported_dtypes = { - "fp16": torch.float16, - "bf16": torch.bfloat16, - "no": None, - } - return supported_dtypes[dtype] - - -def load_tokenizer(config: Dict): - if config["General"].get("tokenizer_name") is not None: - tokenizer_name = config["General"].get("tokenizer_name") - else: - tokenizer_name = config["General"]["base_model"] - load_config = config["General"].get("config", {}) - # default padding side is right - padding_side = config["Dataset"].get("padding_side", "right") - # default truncation side is right - 
truncation_side = config["Dataset"].get("truncation_side", "right") - tokenizer = transformers.AutoTokenizer.from_pretrained( - tokenizer_name, padding_side=padding_side, truncation_side=truncation_side, **load_config - ) - return tokenizer - - -def load_dataset(config: Dict): - dataset_file = config["Dataset"].get("train_file", None) - if dataset_file is None: - return - - if os.path.exists(dataset_file): - # load from local file - def local_load(name, **load_config): - if os.path.isfile(name): - file = os.path.basename(os.path.abspath(name)) - path = os.path.dirname(os.path.abspath(name)) - dataset = datasets.load_dataset(path, data_files=file, **load_config) - else: - dataset = datasets.load_dataset(name, **load_config) - return dataset["train"] - - train_dataset = local_load(dataset_file) - validation_file = config["Dataset"].get("validation_file", None) - if validation_file is not None: - validation_dataset = local_load(validation_file) - return datasets.DatasetDict({"train": train_dataset, "validation": validation_dataset}) - - validation_split_percentage = config["Dataset"].get("validation_split_percentage", 0) - if validation_split_percentage / 100 > 0.0 and validation_split_percentage / 100 < 1.0: - dataset_dict = train_dataset.train_test_split(test_size=validation_split_percentage / 100) - dataset_dict["validation"] = dataset_dict["test"] - return dataset_dict - - return datasets.DatasetDict({"train": train_dataset}) - else: - # try to download and load dataset from huggingface.co - load_config = config["General"].get("config", {}) - use_auth_token = load_config.get("token", None) - raw_dataset = datasets.load_dataset(dataset_file, token=use_auth_token) - - validation_split_percentage = config["Dataset"].get("validation_split_percentage", 0) - if "validation" not in raw_dataset.keys() and ( - validation_split_percentage / 100 > 0.0 and validation_split_percentage / 100 < 1.0 - ): - dataset_dict = raw_dataset["train"].train_test_split(test_size=validation_split_percentage / 100) - dataset_dict["validation"] = dataset_dict["test"] - return dataset_dict - - return raw_dataset - - -def tokenize_dataset(config: Dict, tokenizer, dataset): - task = config["General"].get("task", "instruction_tuning") - if task == "instruction_tuning": - group = config["Dataset"].get("group", True) - block_size = config["Dataset"].get("block_size", 512) - tokenizer.pad_token = tokenizer.eos_token if not tokenizer.pad_token else tokenizer.pad_token - - processor = InstructionDataProcessor(config, tokenizer) - - for key in dataset: - prompts = processor.make_prompt(dataset[key]) - dataset[key] = datasets.Dataset.from_dict(prompts) - - column_names = list(dataset["train"].features) - tokenize_fn = ( - processor.tokenize_by_neural_chat - if config["Dataset"].get("data_preprocess_type", "") == "neural_chat" - else processor.tokenize - ) - - tokenized_dataset = dataset.map( - tokenize_fn, - remove_columns=column_names, - batched=True, - load_from_cache_file=False, - desc="Tokenize dataset", - ) - - if group: - - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. 
- result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - return result - - tokenized_dataset = tokenized_dataset.map( - group_texts, - batched=True, - load_from_cache_file=False, - desc=f"Grouping texts in chunks of {block_size}", - ) - - return tokenized_dataset - elif task == "pretraining": - group = True - block_size = config["Dataset"].get("block_size", 512) - tokenizer.pad_token = tokenizer.eos_token if not tokenizer.pad_token else tokenizer.pad_token - - processor = PretrainingDataProcessor(config, tokenizer) - - column_names = list(dataset["train"].features) - - tokenized_dataset = dataset.map( - processor.tokenize, - remove_columns=column_names, - batched=True, - load_from_cache_file=False, - desc="Tokenize dataset", - ) - - if group: - - def group_texts(examples): - # Concatenate all texts. - concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} - total_length = len(concatenated_examples[list(examples.keys())[0]]) - # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can - # customize this part to your needs. - if total_length >= block_size: - total_length = (total_length // block_size) * block_size - # Split by chunks of max_len. - result = { - k: [t[i : i + block_size] for i in range(0, total_length, block_size)] - for k, t in concatenated_examples.items() - } - return result - - tokenized_dataset = tokenized_dataset.map( - group_texts, - batched=True, - load_from_cache_file=False, - desc=f"Grouping texts in chunks of {block_size}", - ) - - return tokenized_dataset - elif task == "rerank": - dataset["train"] = TrainDatasetForCE(dataset["train"], config["Dataset"], tokenizer) - return dataset - elif task == "embedding": - dataset["train"] = TrainDatasetForEmbedding(dataset["train"], config["Dataset"], tokenizer) - return dataset - else: - raise NotImplementedError(f"Unsupported task {task}, only support instruction_tuning, rerank, embedding now.") - - -def prepare_data_collator(config: Dict, tokenizer): - task = config["General"].get("task", "instruction_tuning") - if task == "instruction_tuning" or task == "pretraining": - return transformers.DataCollatorForLanguageModeling( - tokenizer=tokenizer, mlm=False, return_tensors="pt", pad_to_multiple_of=8 - ) - elif task == "rerank": - return GroupCollator(tokenizer) - elif task == "embedding": - return EmbedCollator( - tokenizer=tokenizer, - padding=config["Dataset"]["padding"], - query_max_len=config["Dataset"]["query_max_len"], - passage_max_len=config["Dataset"]["passage_max_len"], - ) - else: - raise NotImplementedError(f"Unsupported task {task}, only support instruction_tuning, rerank, embedding now.") - - -def load_model(config: Dict): - model_name = config["General"]["base_model"] - model_dtype = convert_dtype(config["Training"].get("mixed_precision", "no")) - model_config = config["General"].get("config", {}) - task = config["General"].get("task", "instruction_tuning") - compression_ctrl = None - if task == "instruction_tuning" or task == "pretraining": - model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=model_dtype, **model_config) - lora_config = config["General"].get("lora_config", None) - if lora_config and task != "pretraining": - neural_lora_search = lora_config.pop("neural_lora_search", False) - target_module_groups = lora_config.pop("target_module_groups", None) - search_space = lora_config.pop("search_space", None) - nncf_config = 
lora_config.pop("nncf_config", None) - if not lora_config.get("sparse_adapter", False): - # To avoid the error in the following case: - # not using SparsePEFT and not having the peft library that supports SparsePEFT installed. - lora_config.pop("sparse_adapter", False) - peft_config = LoraConfig(**lora_config) - model = get_peft_model(model, peft_config) - - # Neural LoRA Search (NLS) - if neural_lora_search: - if not is_nncf_available: - raise ImportError("NNCF is not installed. Please install it.") - nncf_config = load_nncf_config( - config=config, - model=model, - target_module_groups=target_module_groups, - search_space=search_space, - nncf_config=nncf_config, - ) - model = create_nncf_network(model, nncf_config) - compression_ctrl, model = create_compressed_model_from_algo_names( - model, nncf_config, algo_names=["progressive_shrinking"] - ) - elif task == "rerank": - model = CrossEncoder.from_pretrained( - config["Dataset"].get("train_group_size", 8), - config["Training"]["batch_size"], - model_name, - from_tf=bool(".ckpt" in model_name), - config=model_config, - ) - elif task == "embedding": - should_concat = False - if ( - config["Dataset"]["query_max_len"] == config["Dataset"]["passage_max_len"] - and config["Dataset"]["padding"] == "max_length" - ): - should_concat = True - if config["Training"]["device"] == "hpu" and not should_concat: - raise ValueError("please set query_max_len==passage_max_len and padding='max_length' for hpu.") - - if config["Training"].get("embedding_training_config", None) is not None: - model = BiEncoderModel( - model_name=model_name, should_concat=should_concat, **config["Training"]["embedding_training_config"] - ) - else: - model = BiEncoderModel(model_name=model_name, should_concat=should_concat) - else: - raise NotImplementedError(f"Unsupported task {task}, only support instruction_tuning, rerank, embedding now.") - - egc = config["General"].get("enable_gradient_checkpointing", False) - if egc: - model.enable_input_require_grads() - model.gradient_checkpointing_enable() - model.config.use_cache = False - - model.to(dtype=model_dtype, device=torch.device(config["Training"]["device"])) - - return model, compression_ctrl - - -def get_trainer(config: Dict, model, tokenizer, tokenized_dataset, data_collator, compression_ctrl=None): - device = config["Training"]["device"] - if device in ["cpu", "gpu", "cuda"]: - training_args = convert_to_training_args(TrainingArguments, config) - trainer_args = { - "model": model, - "args": training_args, - "train_dataset": tokenized_dataset["train"], - "eval_dataset": ( - tokenized_dataset["validation"] if tokenized_dataset.get("validation") is not None else None - ), - "tokenizer": tokenizer, - "data_collator": data_collator, - } - if compression_ctrl is not None: - trainer_args["compression_ctrl"] = compression_ctrl - - trainer = Trainer(**trainer_args) - return training_args, trainer - elif device in ["hpu"]: - assert compression_ctrl is None - from optimum.habana import GaudiConfig - from optimum.habana.transformers import GaudiTrainer, GaudiTrainingArguments - - # If gaudi_config_name is provided, load gaudi_config from huggingface model hub(https://huggingface.co/Habana), otherwise use default gaudi_config - gaudi_config_name = config["General"].get("gaudi_config_name", None) - if gaudi_config_name is not None: - gaudi_config = GaudiConfig.from_pretrained(gaudi_config_name) - else: - gaudi_config = GaudiConfig() - gaudi_config.use_fused_adam = True - gaudi_config.use_fused_clip_norm = True - - training_args = 
convert_to_training_args(GaudiTrainingArguments, config) - trainer = GaudiTrainer( - model=model, - args=training_args, - gaudi_config=gaudi_config, - train_dataset=tokenized_dataset["train"], - eval_dataset=tokenized_dataset["validation"] if tokenized_dataset.get("validation") is not None else None, - tokenizer=tokenizer, - data_collator=data_collator, - ) - return training_args, trainer - return None - - -def train_func(config: Dict[str, Any]): - os.chdir(config["cwd"]) - - adapt_transformers_to_device(config) - - set_seed(config) - - tokenizer = load_tokenizer(config) - - dataset = load_dataset(config) - - max_train_samples = config["Dataset"].get("max_train_samples", 0) - if 0 < max_train_samples < len(dataset["train"]): - dataset["train"] = dataset["train"].select(range(max_train_samples)) - - max_eval_samples = config["Dataset"].get("max_eval_samples", 0) - if "validation" in dataset and 0 < max_eval_samples < len(dataset["validation"]): - dataset["validation"] = dataset["validation"].select(range(max_eval_samples)) - - tokenized_dataset = tokenize_dataset(config, tokenizer, dataset) - - data_collator = prepare_data_collator(config, tokenizer) - - model, compression_ctrl = load_model(config) - - training_args, trainer = get_trainer( - config, model, tokenizer, tokenized_dataset, data_collator, compression_ctrl=compression_ctrl - ) - - logger.info("train start") - trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint) - trainer.save_model() - logger.info("train finish") - - -def get_finetune_config(): - parser = argparse.ArgumentParser(description="Finetune a transformers model on a causal language modeling task") - parser.add_argument( - "--config_file", - type=str, - required=True, - default=None, - help="The name of the dataset to use (via the datasets library).", - ) - - # Print help if no arguments were provided - if len(sys.argv) == 1: - parser.print_help(sys.stderr) - sys.exit(1) - - args = parser.parse_args() - config_file = args.config_file - - with open(config_file) as f: - finetune_config = parse_yaml_raw_as(FinetuneConfig, f) - return finetune_config.dict() - - -def main(external_config=None): - if not external_config: - config = get_finetune_config() - else: - config = external_config - - config["cwd"] = os.getcwd() - - num_training_workers = config["Training"].get("num_training_workers") - resources_per_worker = config["Training"].get("resources_per_worker") - - if num_training_workers > 1 and config["Training"].get("accelerate_mode", None) is None: - config["Training"]["accelerate_mode"] = "DDP" # will use DDP to accelerate if no method specified - - ccl_worker_count = 1 - device = config["Training"]["device"] - if device != "cpu": - ccl_worker_count = num_training_workers - - if not ray.is_initialized(): - runtime_env = { - "env_vars": { - "OMP_NUM_THREADS": str(resources_per_worker["CPU"]), - "CCL_ZE_IPC_EXCHANGE": "sockets", - "CCL_WORKER_COUNT": str(ccl_worker_count), - "CCL_LOG_LEVEL": "info", - "FI_TCP_IFACE": "lo", - "FI_PROVIDER": "tcp", - } - } - - if config["General"]["gpt_base_model"] is True: - runtime_env["pip"] = ["transformers==4.26.0"] - - if device == "gpu": - num_cpus = resources_per_worker["CPU"] * num_training_workers + 1 # additional 1 for head worker - ray.init(num_cpus=num_cpus, runtime_env=runtime_env) - else: - ray.init(runtime_env=runtime_env) - - logger.info(f"ray available resources = {ray.available_resources()}") - - use_gpu = True if device == "gpu" else False - scaling_config = ScalingConfig( - 
num_workers=num_training_workers, - use_gpu=use_gpu, - resources_per_worker=resources_per_worker, - placement_strategy="SPREAD", - ) - - # if try to use Intel GPU, convert device to 'xpu' - # due to accelerate internal use 'xpu' represent Intel GPU - if device == "gpu": - from accelerate.utils import is_xpu_available - - if is_xpu_available(): - device = "xpu" - - # Jinjie: commented out the code from line 572 to 581 to temporarily disable CCL for debugging purposes. - # if config.get("torch_config", None) is None: - # backend = None - # if device == "cpu" or device == "xpu" or device == "gpu": - # backend = "ccl" - # elif device == "hpu": - # backend = "hccl" - # torch_config = common.TorchConfig(backend=backend, device=device) - # else: - # customer_torch_config = config.get("torch_config") - # torch_config = common.TorchConfig(**customer_torch_config, device=device) - - if config.get("failure_config", None) is None: - failure_config = FailureConfig() - else: - customer_failure_config = config.get("failure_config") - failure_config = FailureConfig(**customer_failure_config) - - if config.get("run_config", None) is None: - run_config = RunConfig(failure_config=failure_config) - else: - customer_run_config = config.get("run_config") - if customer_run_config.get("failure_config", None) is None: - customer_run_config["failure_config"] = failure_config - run_config = RunConfig(**customer_run_config) - - trainer = TorchTrainer( - train_func, - train_loop_config=config, - scaling_config=scaling_config, - # torch_config=torch_config, # Jinjie: check line 571. - run_config=run_config, - ) - results = trainer.fit() - if external_config is not None: - return results - - -if __name__ == "__main__": - main() diff --git a/comps/finetuning_sqft/llm_on_ray/finetune/modeling.py b/comps/finetuning_sqft/llm_on_ray/finetune/modeling.py deleted file mode 100644 index 7a2884f3bc..0000000000 --- a/comps/finetuning_sqft/llm_on_ray/finetune/modeling.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from typing import Dict, Optional - -import torch -import torch.distributed as dist -from torch import nn -from transformers import AutoModel, AutoModelForSequenceClassification, PreTrainedModel -from transformers.modeling_outputs import MaskedLMOutput, SequenceClassifierOutput - -from comps import CustomLogger - -logger = CustomLogger("llm_on_ray/finetune/modeling") - - -class CrossEncoder(PreTrainedModel): - def __init__(self, hf_model: PreTrainedModel, train_group_size: int, batch_size: int): - super().__init__(hf_model.config) - self.hf_model = hf_model - self.train_group_size = train_group_size - self.batch_size = batch_size - - self.cross_entropy = nn.CrossEntropyLoss(reduction="mean") - - self.register_buffer("target_label", torch.zeros(self.batch_size, dtype=torch.long)) - - def gradient_checkpointing_enable(self, **kwargs): - self.hf_model.gradient_checkpointing_enable(**kwargs) - - def forward(self, **batch): - ranker_out: SequenceClassifierOutput = self.hf_model(**batch, return_dict=True) - logits = ranker_out.logits - - if self.training: - scores = logits.view(-1, self.train_group_size) - loss = self.cross_entropy(scores, self.target_label[: scores.shape[0]]) - - return SequenceClassifierOutput( - loss=loss, - **ranker_out, - ) - else: - return ranker_out - - @classmethod - def from_pretrained(cls, train_group_size: int, batch_size: int, *args, **kwargs): - hf_model = AutoModelForSequenceClassification.from_pretrained(*args, **kwargs) - 
reranker = cls(hf_model, train_group_size, batch_size) - return reranker - - def save_pretrained(self, output_dir: str, **kwargs): - state_dict = self.hf_model.state_dict() - state_dict = type(state_dict)({k: v.clone().cpu() for k, v in state_dict.items()}) - kwargs.pop("state_dict") - self.hf_model.save_pretrained(output_dir, state_dict=state_dict, **kwargs) - - -class BiEncoderModel(nn.Module): - TRANSFORMER_CLS = AutoModel - - def __init__( - self, - model_name: str = None, - should_concat: bool = False, - normalized: bool = False, - sentence_pooling_method: str = "cls", - negatives_cross_device: bool = False, - temperature: float = 1.0, - use_inbatch_neg: bool = True, - ): - super().__init__() - self.model = AutoModel.from_pretrained(model_name, add_pooling_layer=False) - self.cross_entropy = nn.CrossEntropyLoss(reduction="mean") - - self.should_concat = should_concat - self.normalized = normalized - self.sentence_pooling_method = sentence_pooling_method - self.temperature = temperature - self.use_inbatch_neg = use_inbatch_neg - self.config = self.model.config - - if not normalized: - self.temperature = 1.0 - logger.info("reset temperature = 1.0 due to using inner product to compute similarity") - if normalized: - if self.temperature > 0.5: - raise ValueError( - "Temperature should be smaller than 1.0 when use cosine similarity (i.e., normalized=True). Recommend to set it 0.01-0.1" - ) - - self.negatives_cross_device = negatives_cross_device - if self.negatives_cross_device: - if not dist.is_initialized(): - raise ValueError("Distributed training has not been initialized for representation all gather.") - # logger.info("Run in a single GPU, set negatives_cross_device=False") - # self.negatives_cross_device = False - # else: - self.process_rank = dist.get_rank() - self.world_size = dist.get_world_size() - - def gradient_checkpointing_enable(self, **kwargs): - self.model.gradient_checkpointing_enable(**kwargs) - - def sentence_embedding(self, hidden_state, mask): - if self.sentence_pooling_method == "mean": - s = torch.sum(hidden_state * mask.unsqueeze(-1).float(), dim=1) - d = mask.sum(axis=1, keepdim=True).float() - return s / d - elif self.sentence_pooling_method == "cls": - return hidden_state[:, 0] - - def encode(self, features): - if features is None: - return None - psg_out = self.model(**features, return_dict=True) - p_reps = self.sentence_embedding(psg_out.last_hidden_state, features["attention_mask"]) - if self.normalized: - p_reps = torch.nn.functional.normalize(p_reps, dim=-1) - return p_reps.contiguous() - - def encode_concat(self, query, passage): - if query is None or passage is None: - return None - - batch_size = query["input_ids"].size()[0] - - psg_out = self.model( - input_ids=torch.cat([query["input_ids"], passage["input_ids"]]), - attention_mask=torch.cat([query["attention_mask"], passage["attention_mask"]]), - return_dict=True, - ) - reps = self.sentence_embedding( - psg_out.last_hidden_state, torch.cat([query["attention_mask"], passage["attention_mask"]]) - ) - if self.normalized: - reps = torch.nn.functional.normalize(reps, dim=-1) - - q_reps = reps[:batch_size] - p_reps = reps[batch_size:] - - return q_reps.contiguous(), p_reps.contiguous() - - def compute_similarity(self, q_reps, p_reps): - if len(p_reps.size()) == 2: - return torch.matmul(q_reps, p_reps.transpose(0, 1)) - return torch.matmul(q_reps, p_reps.transpose(-2, -1)) - - def forward(self, query: Dict[str, torch.Tensor] = None, passage: Dict[str, torch.Tensor] = None): - if self.should_concat: - q_reps, 
p_reps = self.encode_concat(query, passage) - else: - q_reps = self.encode(query) - p_reps = self.encode(passage) - - if self.training: - if self.negatives_cross_device and self.use_inbatch_neg: - q_reps = self._dist_gather_tensor(q_reps) - p_reps = self._dist_gather_tensor(p_reps) - - group_size = p_reps.size(0) // q_reps.size(0) - if self.use_inbatch_neg: - scores = self.compute_similarity(q_reps, p_reps) / self.temperature # B B*G - scores = scores.view(q_reps.size(0), -1) - - target = torch.arange(scores.size(0), device=scores.device, dtype=torch.long) - target = target * group_size - loss = self.compute_loss(scores, target) - else: - scores = ( - self.compute_similarity( - q_reps[ - :, - None, - :, - ], - p_reps.view(q_reps.size(0), group_size, -1), - ).squeeze(1) - / self.temperature - ) # B G - - scores = scores.view(q_reps.size(0), -1) - target = torch.zeros(scores.size(0), device=scores.device, dtype=torch.long) - loss = self.compute_loss(scores, target) - - else: - scores = self.compute_similarity(q_reps, p_reps) - loss = None - - return MaskedLMOutput(loss=loss, logits=None, hidden_states=None, attentions=None) - - def compute_loss(self, scores, target): - return self.cross_entropy(scores, target) - - def _dist_gather_tensor(self, t: Optional[torch.Tensor]): - if t is None: - return None - t = t.contiguous() - - all_tensors = [torch.empty_like(t) for _ in range(self.world_size)] - dist.all_gather(all_tensors, t) - - all_tensors[self.process_rank] = t - all_tensors = torch.cat(all_tensors, dim=0) - - return all_tensors - - def save(self, output_dir: str): - state_dict = self.model.state_dict() - state_dict = type(state_dict)({k: v.clone().cpu() for k, v in state_dict.items()}) - self.model.save_pretrained(output_dir, state_dict=state_dict) diff --git a/comps/finetuning_sqft/patches/nncf-v2.12.0.patch b/comps/finetuning_sqft/patches/nncf-v2.12.0.patch deleted file mode 100644 index f4cbfe0401..0000000000 --- a/comps/finetuning_sqft/patches/nncf-v2.12.0.patch +++ /dev/null @@ -1,72 +0,0 @@ -diff --git a/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py b/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py -index bc6464b24..ca2666626 100644 ---- a/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py -+++ b/nncf/experimental/torch/nas/bootstrapNAS/elasticity/elasticity_builder.py -@@ -152,3 +152,16 @@ class ElasticityBuilder(PTCompressionAlgorithmBuilder): - - # No conflict resolving with the related config options, parameters are overridden by compression state - self._available_elasticity_dims = list(map(ElasticityDim, available_elasticity_dims_state)) -+ -+ def _are_frozen_layers_allowed(self): -+ """ -+ Check if frozen layers are allowed based on NNCF configuration. -+ If specified in NNCF configuration, frozen layers will be allowed. -+ -+ :return: A tuple where the first element is a boolean indicating if frozen layers are allowed, -+ and the second element is a string message explaining the reason. 
-+ """ -+ frozen_layers_allowed = self.config.get("bootstrapNAS", {}).get("training", {}).get("frozen_layers_allowed", False) -+ if frozen_layers_allowed: -+ return True, "Frozen layers are allowed (`frozen_layers_allowed` is set to True in NNCF config)" -+ return super()._are_frozen_layers_allowed() -diff --git a/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py b/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py -index 92609327f..7a0555e3e 100644 ---- a/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py -+++ b/nncf/experimental/torch/nas/bootstrapNAS/training/progressive_shrinking_builder.py -@@ -152,3 +152,16 @@ class ProgressiveShrinkingBuilder(PTCompressionAlgorithmBuilder): - self._bn_adapt_params = state_without_name[self._state_names.BN_ADAPTATION_PARAMS] - bn_adapt_algo_kwargs = get_bn_adapt_algo_kwargs(self.config, self._bn_adapt_params) - self._bn_adaptation = BatchnormAdaptationAlgorithm(**bn_adapt_algo_kwargs) if bn_adapt_algo_kwargs else None -+ -+ def _are_frozen_layers_allowed(self): -+ """ -+ Check if frozen layers are allowed based on the algorithm configuration. -+ If specified in the algorithm configuration, frozen layers will be allowed. -+ -+ :return: A tuple where the first element is a boolean indicating if frozen layers are allowed, -+ and the second element is a string message explaining the reason. -+ """ -+ frozen_layers_allowed = self._algo_config.get("frozen_layers_allowed", False) -+ if frozen_layers_allowed: -+ return True, "Frozen layers are allowed (`frozen_layers_allowed` is set to True in the algorithm config)" -+ return super()._are_frozen_layers_allowed() -diff --git a/nncf/torch/layer_utils.py b/nncf/torch/layer_utils.py -index fb7d7bed7..3b8fda98e 100644 ---- a/nncf/torch/layer_utils.py -+++ b/nncf/torch/layer_utils.py -@@ -127,6 +127,25 @@ class _NNCFModuleMixin: - results = op_results - return results - -+ def get_proxy_module(self, *args): -+ """ -+ Gets a proxy module with pre-operations applied. -+ -+ Args: -+ *args: Arguments for the pre-operations. -+ -+ Returns: -+ ProxyModule: The proxy module with pre-operations applied. -+ """ -+ proxy_module = ProxyModule(self) -+ for op in self.pre_ops.values(): -+ op_args = op(proxy_module, args) -+ if op_args is not None: -+ if not isinstance(op_args, tuple): -+ op_args = tuple([op_args]) -+ args = op_args -+ return proxy_module -+ - - class CompressionParameter(nn.Parameter): - """ diff --git a/comps/finetuning_sqft/patches/peft-v0.10.0.patch b/comps/finetuning_sqft/patches/peft-v0.10.0.patch deleted file mode 100644 index 9606bd24ef..0000000000 --- a/comps/finetuning_sqft/patches/peft-v0.10.0.patch +++ /dev/null @@ -1,220 +0,0 @@ -diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py -index cc5c60a..fa1422e 100644 ---- a/src/peft/tuners/lora/config.py -+++ b/src/peft/tuners/lora/config.py -@@ -268,6 +268,31 @@ class LoraConfig(PeftConfig): - ) - }, - ) -+ sparse_adapter: bool = field( -+ default=False, -+ metadata={ -+ "help": ( -+ "Enable 'SparsePEFT'. This strategy is designed for fine-tuning sparse models using adapters. " -+ "It sparsifies the adapter's parameter matrix (BA) such that the sparsity pattern of BA aligns " -+ "with that of the base model's weights (W). This alignment allows for the merging of the adapter " -+ "with the base model without disrupting its sparsity. 
It is derived from SQFT() and is used in the " -+ "pipelines SQFT + SparsePEFT and SQFT + QA-SparsePEFT." -+ ) -+ } -+ ) -+ quantization_aware: bool = field( -+ default=False, -+ metadata={ -+ "help": ( -+ "Enable quantization-aware training. This strategy is designed for fine-tuning GPTQ quantized models " -+ "using adapters. It activates the `SQFTQuantAwareLinear` from SQFT in place of `QuantLinear`, enabling " -+ "quantization-aware training for adapters. This helps optimize model accuracy and allows the adapter " -+ "to be merged with the base quantized model, improving performance and deployment efficiency during " -+ "inference. This strategy, when used in conjunction with `sparse_adapter`, corresponds to the " -+ "SQFT + QA-SparsePEFT method described in the SQFT paper." -+ ) -+ } -+ ) - - def __post_init__(self): - self.peft_type = PeftType.LORA -diff --git a/src/peft/tuners/lora/gptq.py b/src/peft/tuners/lora/gptq.py -index 333dfa6..7272824 100644 ---- a/src/peft/tuners/lora/gptq.py -+++ b/src/peft/tuners/lora/gptq.py -@@ -108,7 +108,17 @@ def dispatch_gptq( - AutoGPTQQuantLinear = get_auto_gptq_quant_linear(gptq_quantization_config) - - if AutoGPTQQuantLinear is not None and isinstance(target_base_layer, AutoGPTQQuantLinear): -- new_module = QuantLinear(target, adapter_name, **kwargs) -+ quantization_aware = kwargs.get("quantization_aware", False) -+ if quantization_aware: -+ # Attempt to import the `SQFTQuantAwareLinear` module -+ # from https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/blob/main/SQFT/modules/sqft_linear.py -+ try: -+ from modules.sqft_linear import SQFTQuantAwareLinear -+ except ImportError: -+ raise ImportError("The module 'SQFTQuantAwareLinear' could not be imported.") -+ new_module = SQFTQuantAwareLinear(target, adapter_name, **kwargs) -+ else: -+ new_module = QuantLinear(target, adapter_name, **kwargs) - target.qweight = target_base_layer.qweight - - return new_module -diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py -index 829b7bd..9d83967 100644 ---- a/src/peft/tuners/lora/layer.py -+++ b/src/peft/tuners/lora/layer.py -@@ -28,6 +28,10 @@ from peft.utils.other import transpose - - from .config import LoraConfig - -+try: -+ from nncf.torch.layers import NNCFLinear -+except ImportError: -+ NNCFLinear = None - - class LoraLayer(BaseTunerLayer): - # All names of layers that may contain (trainable) adapter weights -@@ -346,6 +350,7 @@ class Linear(nn.Module, LoraLayer): - init_lora_weights: Union[bool, str] = True, - use_rslora: bool = False, - use_dora: bool = False, -+ sparse_adapter: bool = False, # Set this to True if enabling 'SparsePEFT' for fine-tuning sparse models - **kwargs, - ) -> None: - super().__init__() -@@ -363,6 +368,7 @@ class Linear(nn.Module, LoraLayer): - use_dora=use_dora, - ) - self.is_target_conv_1d_layer = is_target_conv_1d_layer -+ self.sparse_adapter = sparse_adapter - - def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: - """ -@@ -471,6 +477,10 @@ class Linear(nn.Module, LoraLayer): - weight_B = weight_B.float() - - output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter] -+ if self.sparse_adapter: -+ # Apply the sparse mask to BA (`output_tensor`). 
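# (illustrative note) Wherever the base weight W is exactly zero, the corresponding mask
# entry is False and that entry of BA * scaling is zeroed out, so the merged weight
# W + BA * scaling keeps the sparsity pattern of W; this is what lets a SparsePEFT adapter
# be merged without disrupting the base model's sparsity.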
-+ mask = (self.base_layer.weight != 0) -+ output_tensor = output_tensor * mask - - if cast_to_fp32: - output_tensor = output_tensor.to(dtype=dtype) -@@ -506,7 +516,26 @@ class Linear(nn.Module, LoraLayer): - x = x.to(lora_A.weight.dtype) - - if not self.use_dora[active_adapter]: -- result = result + lora_B(lora_A(dropout(x))) * scaling -+ if not self.sparse_adapter: -+ result = result + lora_B(lora_A(dropout(x))) * scaling -+ else: -+ # Since 'sparse_adapter' is enabled, we need to multiply the parameter matrices of `lora_B` and -+ # `lora_A` here instead of calling the forward methods of `lora_B` and `lora_A`. This results -+ # in the NNCF graph not recognizing lora A and lora B nodes when using NLS strategy. Therefore, -+ # we execute `lora_B(lora_A(x))` solely to include these two NNCFLinear nodes in the NNCF graph. -+ if NNCFLinear is not None and not self.training: -+ lora_B(lora_A(x)) -+ if NNCFLinear is not None and isinstance(lora_A, NNCFLinear): -+ adapter_weight = torch.matmul( -+ lora_B.get_proxy_module(x).weight, -+ lora_A.get_proxy_module(x).weight -+ ) * scaling -+ else: -+ adapter_weight = torch.matmul(lora_B.weight, lora_A.weight) * scaling -+ # Apply the sparse mask to BA (`adapter_weight`). -+ mask = (self.base_layer.weight != 0).detach() -+ adapter_weight = adapter_weight * mask -+ result = result + nn.functional.linear(dropout(x), adapter_weight) - else: - x = dropout(x) - result = result + self._apply_dora(x, lora_A, lora_B, scaling, active_adapter) -diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py -index 3f381ef..3e696ca 100644 ---- a/src/peft/tuners/lora/model.py -+++ b/src/peft/tuners/lora/model.py -@@ -193,6 +193,8 @@ class LoraModel(BaseTuner): - "init_lora_weights": lora_config.init_lora_weights, - "use_rslora": lora_config.use_rslora, - "use_dora": lora_config.use_dora, -+ "quantization_aware": lora_config.quantization_aware, -+ "sparse_adapter": lora_config.sparse_adapter, - "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), - "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), - } -@@ -233,7 +235,10 @@ class LoraModel(BaseTuner): - child = child.base_layer - - if not hasattr(new_module, "base_layer"): -- new_module.weight = child.weight -+ if hasattr(child, "qweight"): -+ new_module.qweight = child.qweight -+ else: -+ new_module.weight = child.weight - if hasattr(child, "bias"): - new_module.bias = child.bias - -@@ -401,7 +406,11 @@ class LoraModel(BaseTuner): - Currently gptq quantization and replicated layers do not support merging. 
- """ - if getattr(self.model, "quantization_method", None) == "gptq": -- raise ValueError("Cannot merge LORA layers when the model is gptq quantized") -+ peft_config = self.get_peft_config_as_dict() -+ # Check if the 'quantization_aware' flag is set to False in the PEFT configuration -+ # Raise an error if the model is GPTQ quantized and 'quantization_aware' is not enabled -+ if not peft_config.get("quantization_aware", False): -+ raise ValueError("Cannot merge LORA layers when the model is gptq quantized") - if self.peft_config.get("layer_replication"): - raise ValueError("Cannot merge LORA layers when base model layers are replicated") - -diff --git a/src/peft/utils/save_and_load.py b/src/peft/utils/save_and_load.py -index 5ac1264..acb5d27 100644 ---- a/src/peft/utils/save_and_load.py -+++ b/src/peft/utils/save_and_load.py -@@ -246,6 +246,48 @@ def set_peft_model_state_dict(model, peft_model_state_dict, adapter_name="default - else: - raise NotImplementedError - -+ def module_reshape(state_dict): -+ """Reshape the linear module to match the state dict. -+ -+ Args: -+ state_dict (dict): The state dict containing the parameters. -+ """ -+ for param_name, param in state_dict.items(): -+ tensor_name = param_name -+ splits = tensor_name.split(".") -+ -+ # If the parameter name has multiple parts, navigate through the module hierarchy -+ if len(splits) > 1: -+ module = model -+ parent = None -+ -+ # Traverse the module hierarchy to find the target module -+ for split in splits[:-1]: -+ new_module = getattr(module, split, None) -+ if new_module is None: -+ raise ValueError(f"{module} has no attribute {split}.") -+ parent = module -+ module = new_module -+ -+ tensor_name = splits[-1] -+ old_value = getattr(module, tensor_name) -+ -+ # Check if the shape of the original module differs from the shape of the loaded parameter -+ if old_value.shape != param.shape and isinstance(module, torch.nn.Linear): -+ # Create a new Linear module with the new shape -+ new_module = torch.nn.Linear( -+ param.shape[1], -+ param.shape[0], -+ bias=module.bias is not None, -+ dtype=module.weight.dtype, -+ device=module.weight.device -+ ) -+ # Replace the old module with the new one in the parent module -+ setattr(parent, splits[-2], new_module) -+ -+ # Reshape the modules in the peft model to match the state dict -+ module_reshape(peft_model_state_dict) -+ - load_result = model.load_state_dict(peft_model_state_dict, strict=False) - if config.is_prompt_learning: - model.prompt_encoder[adapter_name].embedding.load_state_dict( diff --git a/comps/finetuning_sqft/patches/transformers-v4.44.2.patch b/comps/finetuning_sqft/patches/transformers-v4.44.2.patch deleted file mode 100644 index a35e96297a..0000000000 --- a/comps/finetuning_sqft/patches/transformers-v4.44.2.patch +++ /dev/null @@ -1,171 +0,0 @@ -diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py -index 68ba7babf..6b54a3987 100755 ---- a/src/transformers/trainer.py -+++ b/src/transformers/trainer.py -@@ -155,6 +155,7 @@ from .utils import ( - is_in_notebook, - is_ipex_available, - is_lomo_available, -+ is_nncf_available, - is_peft_available, - is_safetensors_available, - is_sagemaker_dp_enabled, -@@ -245,6 +246,11 @@ if is_accelerate_available(): - if is_accelerate_available("0.28.0"): - from accelerate.utils import DataLoaderConfiguration - -+if is_nncf_available(): -+ from nncf.torch.compression_method_api import PTCompressionAlgorithmController -+else: -+ PTCompressionAlgorithmController = None -+ - - def _is_peft_model(model): - if 
is_peft_available(): -@@ -352,6 +358,8 @@ class Trainer: - by this function will be reflected in the predictions received by `compute_metrics`. - - Note that the labels (second parameter) will be `None` if the dataset does not have them. -+ compression_ctrl ([`PTCompressionAlgorithmController`], *optional*): A compression controller to use. Note that -+ this script only supports `ProgressiveShrinkingController` of NNCF (https://github.com/openvinotoolkit/nncf). - - Important attributes: - -@@ -387,6 +395,7 @@ class Trainer: - callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), - preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, -+ compression_ctrl: PTCompressionAlgorithmController = None - ): - if args is None: - output_dir = "tmp_trainer" -@@ -400,6 +409,7 @@ class Trainer: - " summary statistics should be returned by the function." - ) - self.args = args -+ self.compression_ctrl = compression_ctrl - # Seed must be set before instantiating the model when using model - enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed) - self.hp_name = None -@@ -1040,7 +1050,10 @@ class Trainer: - optimizer = self.optimizer.optimizer - else: - optimizer = self.optimizer -- self.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) -+ # If compression_ctrl (`ProgressiveShrinkingController`) is not used, create a scheduler. -+ # If compression_ctrl is used (not None), it will use its own learning rate scheduler. -+ if self.compression_ctrl is None: -+ self.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) - - def get_decay_parameter_names(self, model) -> List[str]: - """ -@@ -1569,7 +1582,9 @@ class Trainer: - self.state.stateful_callbacks["TrainerControl"] = self.control.state() - self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) - torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) -- torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) -+ # Save the learning rate scheduler state if compression_ctrl is not used. -+ if self.compression_ctrl is None: -+ torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) - - def call_model_init(self, trial=None): - model_init_argcount = number_of_arguments(self.model_init) -@@ -2204,8 +2219,16 @@ class Trainer: - if args.eval_on_start: - self._evaluate(trial, ignore_keys_for_eval, skip_scheduler=True) - -+ # Initialize the learning rate scheduler if compression_ctrl is used. -+ if self.compression_ctrl is not None: -+ train_iters = len(train_dataloader) -+ self.compression_ctrl.set_training_lr_scheduler_args(self.optimizer, train_iters) -+ - total_batched_samples = 0 - for epoch in range(epochs_trained, num_train_epochs): -+ # Perform an epoch step for the compression controller's scheduler if it is used. -+ if self.compression_ctrl is not None: -+ self.compression_ctrl.scheduler.epoch_step() - epoch_iterator = train_dataloader - if hasattr(epoch_iterator, "set_epoch"): - epoch_iterator.set_epoch(epoch) -@@ -2234,6 +2257,10 @@ class Trainer: - - step = -1 - for step, inputs in enumerate(epoch_iterator): -+ # Perform a step for the compression controller's scheduler if it is used. -+ # Include actions such as activating the subnetwork or updating the learning rate. 
-+ if self.compression_ctrl is not None: -+ self.compression_ctrl.scheduler.step() - total_batched_samples += 1 - - if self.args.include_num_input_tokens_seen: -@@ -2345,7 +2372,10 @@ class Trainer: - optimizer_was_run = not self.accelerator.optimizer_step_was_skipped - if optimizer_was_run: - # Delay optimizer scheduling until metrics are generated -- if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): -+ if ( -+ not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) -+ and self.compression_ctrl is None -+ ): - self.lr_scheduler.step() - - model.zero_grad() -@@ -2791,7 +2821,11 @@ class Trainer: - logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) - if grad_norm is not None: - logs["grad_norm"] = grad_norm.detach().item() if isinstance(grad_norm, torch.Tensor) else grad_norm -- logs["learning_rate"] = self._get_learning_rate() -+ # Retrieve the current learning rate from the compression controller if available, otherwise use the default method -+ if self.compression_ctrl is not None: -+ logs["learning_rate"] = self.compression_ctrl.scheduler.lr_scheduler.get_last_lr()[0] -+ else: -+ logs["learning_rate"] = self._get_learning_rate() - - self._total_loss_scalar += tr_loss_scalar - self._globalstep_last_logged = self.state.global_step -@@ -3015,7 +3049,9 @@ class Trainer: - and not is_torch_xla_available() - ): - with warnings.catch_warnings(record=True) as caught_warnings: -- torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) -+ # Save the learning rate scheduler state if compression_ctrl is not used. -+ if self.compression_ctrl is None: -+ torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) - reissue_pt_warnings(caught_warnings) - - def _load_optimizer_and_scheduler(self, checkpoint): -diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py -index efe473a6c..1040a75f4 100755 ---- a/src/transformers/utils/__init__.py -+++ b/src/transformers/utils/__init__.py -@@ -152,6 +152,7 @@ from .import_utils import ( - is_natten_available, - is_ninja_available, - is_nltk_available, -+ is_nncf_available, - is_onnx_available, - is_openai_available, - is_optimum_available, -diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py -index 3b0abd334..823e8919f 100755 ---- a/src/transformers/utils/import_utils.py -+++ b/src/transformers/utils/import_utils.py -@@ -131,6 +131,7 @@ _levenshtein_available = _is_package_available("Levenshtein") - _librosa_available = _is_package_available("librosa") - _natten_available = _is_package_available("natten") - _nltk_available = _is_package_available("nltk") -+_nncf_available = _is_package_available("nncf") - _onnx_available = _is_package_available("onnx") - _openai_available = _is_package_available("openai") - _optimum_available = _is_package_available("optimum") -@@ -1056,6 +1057,10 @@ def is_nltk_available(): - return _nltk_available - - -+def is_nncf_available(): -+ return _nncf_available -+ -+ - def is_torchaudio_available(): - return _torchaudio_available - diff --git a/comps/finetuning_sqft/requirements.txt b/comps/finetuning_sqft/requirements.txt deleted file mode 100644 index 6eff6b62ac..0000000000 --- a/comps/finetuning_sqft/requirements.txt +++ /dev/null @@ -1,17 +0,0 @@ -aiohttp -datasets -docarray -fastapi -httpx -opentelemetry-api -opentelemetry-exporter-otlp -opentelemetry-sdk -prometheus-fastapi-instrumentator -pydantic==2.8.2 
-pydantic_yaml -python-multipart -pyyaml -ray[all] -requests -shortuuid -uvicorn diff --git a/comps/finetuning_sqft/utils/extract_sub_adapter.py b/comps/finetuning_sqft/utils/extract_sub_adapter.py deleted file mode 100644 index 82e4471719..0000000000 --- a/comps/finetuning_sqft/utils/extract_sub_adapter.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import argparse -import os -import re - -import torch -from nncf import NNCFConfig -from peft.utils import CONFIG_NAME, SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME - -PATTERN = re.compile(r"[[](.*?)[]]", re.S) - - -def get_width_for_query_prefix(torch_module_to_width, query_module, length=5): - """Get the width for a given query module prefix. - - Args: - torch_module_to_width (dict): Mapping from torch module to width. - query_module (str): The query module name. - length (int, optional): The length of the prefix to match. Default is 5. - - Returns: - int: The width for the query module prefix. - """ - query_module_list = query_module.split(".") - width = next( - ( - value - for torch_module, value in torch_module_to_width.items() - if torch_module.split(".")[:length] == query_module_list[:length] - ), - None, - ) - return width - - -def main(adapter_model_path, nncf_config, sub_adapter_version, custom_config=None): - output_dir = os.path.join(adapter_model_path, sub_adapter_version) - os.makedirs(output_dir, exist_ok=True) - nncf_config = NNCFConfig.from_json(nncf_config) - try: - overwrite_groups = nncf_config["bootstrapNAS"]["training"]["elasticity"]["width"]["overwrite_groups"] - overwrite_groups_widths = nncf_config["bootstrapNAS"]["training"]["elasticity"]["width"][ - "overwrite_groups_widths" - ] - assert len(overwrite_groups) == len(overwrite_groups_widths) - except Exception: - raise ValueError("Cannot get the search space in NNCF config.") - - if sub_adapter_version == "maximal": - subnetwork_config = {idx: space[0] for idx, space in enumerate(overwrite_groups_widths)} - elif sub_adapter_version == "heuristic": - subnetwork_config = {idx: space[(len(space) - 1) // 2] for idx, space in enumerate(overwrite_groups_widths)} - elif sub_adapter_version == "minimal": - subnetwork_config = {idx: space[-1] for idx, space in enumerate(overwrite_groups_widths)} - else: - assert custom_config is not None, "Missing custom subnetwork config." - assert isinstance(custom_config, list), "Custom config must be a list." 
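# (illustrative note) custom_config is expected to hold one width per overwrite group,
# e.g. a hypothetical custom_config = [16, 12, 8] selects rank 16 for group 0, 12 for
# group 1 and 8 for group 2; each value is checked against that group's search space below.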
- subnetwork_config = {i: value for i, value in enumerate(custom_config)} - - # Mapping: nncf node -> width - nncf_node_to_width = {} - for idx, value in subnetwork_config.items(): - space = overwrite_groups_widths[idx] - assert min(space) <= value <= max(space) - cur_dict = {node: value for node in overwrite_groups[idx]} - nncf_node_to_width.update(cur_dict) - - # Prune adapter model (LoRA low-rank) - lora_torch_module_to_width = { - ".".join(re.findall(PATTERN, k)): v for k, v in nncf_node_to_width.items() if "lora_A" in k - } - num_module_name_item = list(lora_torch_module_to_width.keys())[0].split(".").index("lora_A") - # Load adapter weights - try: - super_adapter_weights = torch.load(os.path.join(adapter_model_path, WEIGHTS_NAME)) - except: - from safetensors.torch import load_file - - super_adapter_weights = load_file(os.path.join(adapter_model_path, SAFETENSORS_WEIGHTS_NAME)) - sub_adapter_weights = {} - for weight_key, weight_tensor in super_adapter_weights.items(): - width = get_width_for_query_prefix(lora_torch_module_to_width, weight_key, length=num_module_name_item) - if width is not None: - is_loraA = "lora_A" in weight_key - new_weight_tensor = weight_tensor[:width].clone() if is_loraA else weight_tensor[:, :width].clone() - else: - new_weight_tensor = weight_tensor.clone() - sub_adapter_weights[weight_key] = new_weight_tensor - os.makedirs(output_dir, exist_ok=True) - torch.save(sub_adapter_weights, os.path.join(output_dir, WEIGHTS_NAME)) - config_path = os.path.join(adapter_model_path, CONFIG_NAME) - os.system(f"cp {config_path} {output_dir}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Merge base model and adapter model with additional configurations") - parser.add_argument("--adapter_model_path", type=str, required=True, help="Path to the adapter model") - parser.add_argument("--nncf_config", type=str, required=True, help="Path to the NNCF configuration") - parser.add_argument("--sub_adapter_version", type=str, required=True, help="Sub adapter version") - parser.add_argument("--custom_config", type=str, default=None, help="Path to custom configuration (optional)") - args = parser.parse_args() - main(args.adapter_model_path, args.nncf_config, args.sub_adapter_version, args.custom_config) diff --git a/comps/finetuning_sqft/utils/merge.py b/comps/finetuning_sqft/utils/merge.py deleted file mode 100644 index 266ee0eac4..0000000000 --- a/comps/finetuning_sqft/utils/merge.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import argparse - -from peft import PeftModel -from transformers import AutoModelForCausalLM, AutoTokenizer - - -def main(base_model_path, adapter_model_path, output_path): - base_model = AutoModelForCausalLM.from_pretrained(base_model_path, trust_remote_code=True) - model = PeftModel.from_pretrained(base_model, adapter_model_path) - model.eval() - for name, param in model.named_parameters(): - param.requires_grad = False - merged_model = model.merge_and_unload() - merged_model.train(False) - base_model.save_pretrained(output_path, state_dict=merged_model.state_dict()) - - tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True) - tokenizer.save_pretrained(output_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Merge base model and adapter model") - parser.add_argument("--base_model_path", type=str, required=True, help="Path to the base model") - parser.add_argument("--adapter_model_path", 
type=str, required=True, help="Path to the adapter model") - parser.add_argument("--output_path", type=str, required=True, help="Path to save the merged model") - - args = parser.parse_args() - main(args.base_model_path, args.adapter_model_path, args.output_path) diff --git a/comps/finetuning_sqft/utils/nncf_config_process.py b/comps/finetuning_sqft/utils/nncf_config_process.py deleted file mode 100644 index 5f6abb7c8f..0000000000 --- a/comps/finetuning_sqft/utils/nncf_config_process.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import json -import os - -from nncf import NNCFConfig - -NNCF_CONFIG_TEMPLATE = { - "input_info": [ - {"sample_size": [1, 256], "type": "long", "keyword": "input_ids"}, - {"sample_size": [1, 256], "type": "long", "keyword": "attention_mask"}, - ], - "bootstrapNAS": { - "training": { - "algorithm": "progressive_shrinking", - "frozen_layers_allowed": True, - "progressivity_of_elasticity": ["width"], - "batchnorm_adaptation": {"num_bn_adaptation_samples": 0}, - "schedule": { - "list_stage_descriptions": [ - { - "train_dims": ["width"], - "epochs": -1, - "depth_indicator": 1, - "width_indicator": 8, - "init_lr": -1, - "epochs_lr": -1, - "sample_rate": 1, - } - ] - }, - "elasticity": { - "available_elasticity_dims": ["width"], - "width": {"overwrite_groups": [], "overwrite_groups_widths": []}, - }, - } - }, -} - - -def add_lr_epochs(nncf_config, learning_rate=3e-4, num_epochs=3): - """Add learning rate and epochs to the NNCF configuration. - - Args: - nncf_config (dict): The NNCF configuration dictionary. - learning_rate (float): The initial learning rate to set. - num_epochs (int): The number of epochs to set. - - Returns: - dict: The updated NNCF configuration. - """ - stage_description = nncf_config["bootstrapNAS"]["training"]["schedule"]["list_stage_descriptions"][0] - if stage_description["init_lr"] == -1: - stage_description["init_lr"] = learning_rate - if stage_description["epochs"] == -1: - stage_description["epochs"] = num_epochs - stage_description["epochs_lr"] = num_epochs - - return nncf_config - - -def get_model_paths(model, target_module_name): - """Find all paths to the target layer in the model. - - Args: - model (torch.nn.Module): The model to search. - target_module_name (str): The name of the target layer. - - Returns: - list: A list of paths to the target layer. - """ - - def find_layers(module, target_module_name, path, paths): - for name, sub_module in module.named_children(): - new_path = f"{path}/{sub_module.__class__.__name__}[{name}]" - if target_module_name in name: - # Check if 'lora_A' is in the sub_module's children - for sub_name, _ in sub_module.named_children(): - if "lora_A" in sub_name: - paths.append(f"{new_path}/ModuleDict[lora_A]/NNCFLinear[default]/linear_0") - find_layers(sub_module, target_module_name, new_path, paths) - - base_path = model.__class__.__name__ - paths = [] - find_layers(model, target_module_name, base_path, paths) - return paths - - -def load_nncf_config(config, model, target_module_groups=None, search_space=None, nncf_config=None): - """Load and preprocess the NNCF configuration file. - - Returns: - NNCFConfig: The preprocessed NNCF configuration object. - """ - - if nncf_config is not None: - nncf_config = NNCFConfig.from_json(nncf_config) - else: - if search_space is None and target_module_groups: - raise ValueError( - "Neural LoRA search is enabled, `search_space` and `target_module_groups` must be provided." 
- ) - # The NNCF Config will be automatically generated based on `target_module_groups` and `search_space`. - num_hidden_layers = model.config.num_hidden_layers - nncf_config_dict = NNCF_CONFIG_TEMPLATE - overwrite_groups = [] - for group in target_module_groups: - group_paths = [] - for module in group: - target_layer_name = module - paths = get_model_paths(model, target_layer_name) - assert paths, f"No paths found for module {module}" - group_paths.append(paths) - # Transpose the list of lists to combine paths by their positions - transposed_paths = list(zip(*group_paths)) - overwrite_groups.extend([list(path_group) for path_group in transposed_paths]) - nncf_config_dict["bootstrapNAS"]["training"]["elasticity"]["width"]["overwrite_groups"] = overwrite_groups - - overwrite_groups_widths = [] - for space in search_space: - space = [int(width) for width in space.split(",")] - overwrite_groups_widths.extend([space] * num_hidden_layers) - nncf_config_dict["bootstrapNAS"]["training"]["elasticity"]["width"][ - "overwrite_groups_widths" - ] = overwrite_groups_widths - assert len(overwrite_groups) == len(overwrite_groups_widths) - nncf_config_dict = add_lr_epochs( - nncf_config_dict, learning_rate=config["Training"]["learning_rate"], num_epochs=config["Training"]["epochs"] - ) - nncf_config = NNCFConfig.from_dict(nncf_config_dict) - - nncf_config["log_dir"] = config["General"]["output_dir"] - os.makedirs(nncf_config["log_dir"], exist_ok=True) - with open(os.path.join(nncf_config["log_dir"], "nncf_config.json"), "w") as f: - json.dump(nncf_config, f, indent=4) - return nncf_config - - -if __name__ == "__main__": - import transformers - from peft import LoraConfig, get_peft_model - - model = transformers.AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") - lora_config = { - "task_type": "CAUSAL_LM", - "r": 16, - "target_modules": ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj"], - } - peft_config = LoraConfig(**lora_config) - model = get_peft_model(model, peft_config) - load_nncf_config( - None, model, [["q_proj", "k_proj", "v_proj"], ["up_proj"], ["down_proj"]], ["16,12,8", "16", "16,12"] - ) From ed9dc5fc606615f7c43aa08b220bd7766687cfa4 Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Tue, 26 Nov 2024 11:15:03 +0800 Subject: [PATCH 04/17] Add test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Yuan0320 Signed-off-by: J. 
Pablo Muñoz --- tests/finetuning/test_finetuning.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/finetuning/test_finetuning.sh b/tests/finetuning/test_finetuning.sh index 11a544dfda..c333a2f228 100644 --- a/tests/finetuning/test_finetuning.sh +++ b/tests/finetuning/test_finetuning.sh @@ -144,6 +144,18 @@ function validate_microservice() { '{"training_file": "test_data.json","model": "facebook/opt-125m"}' + ########################## + # sqft test # + ########################## + # test /v1/fine_tuning/jobs + validate_finetune \ + "http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs" \ + "sqft - finetuning" \ + "test-comps-finetuning-server" \ + '{"id":"ft-job' \ + '{"training_file": "test_data.json","model": "facebook/opt-125m", "General": {"lora_config": {"r": 8, "neural_lora_search": true, "target_module_groups": [["q_proj"]], "search_space": ["8,6,4"]}}}' + + ########################## # rerank test # ########################## From d5e559c2b6457d36a9b6aa4b2a8a409dcd42462c Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Tue, 26 Nov 2024 11:46:18 +0800 Subject: [PATCH 05/17] Update links MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Yuan0320 Signed-off-by: J. Pablo Muñoz --- comps/finetuning/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/comps/finetuning/README.md b/comps/finetuning/README.md index d6ad323670..83b2adddc5 100644 --- a/comps/finetuning/README.md +++ b/comps/finetuning/README.md @@ -118,7 +118,7 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \ In addition to traditional fine-tuning, you can use SQFT's NLS to fine-tune your model. More details about SQFT can be found in [this paper](https://aclanthology.org/2024.findings-emnlp.749.pdf). -Please follow the additional installation requirements [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea#-start-nls-microservice-with-python). +Please follow the additional installation requirements [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea#-start-the-nls-microservice-with-python). Use the following command to launch a finetuning job with the NLS algorithm: ```bash @@ -145,9 +145,9 @@ curl http://${your_ip}:8015/v1/fine_tuning/jobs \ }' ``` -Detailed explanations for the parameters can be found [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea#create-nls-fine-tuning-job). +Detailed explanations for the parameters can be found [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea#create-an-nls-fine-tuning-job). Additional use-cases and benefits of SQFT are available [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea). -Instructions to extracting the desired sub-adapter and merging it with the base model can be found [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea#leverage-fine-tuned-super-adapter). +Instructions to extracting the desired sub-adapter and merging it with the base model can be found [here](https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/tree/main/SQFT/opea#leverage-the-fine-tuned-super-adapter). 
#### 3.2.3 Reranking Model Training From 0e275d2bcd1617b78b67d4fdead8b6e1f7ef43ee Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Tue, 26 Nov 2024 12:13:41 +0800 Subject: [PATCH 06/17] refactor(SQFTNLSConfig): enhance set_target_modules logic for better validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Yuan0320 Signed-off-by: J. Pablo Muñoz --- comps/finetuning/finetune_config.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/comps/finetuning/finetune_config.py b/comps/finetuning/finetune_config.py index df9a6be9bb..1cf3b08794 100644 --- a/comps/finetuning/finetune_config.py +++ b/comps/finetuning/finetune_config.py @@ -44,12 +44,14 @@ class SQFTNLSConfig(LoraConfig): @root_validator(pre=True) def set_target_modules(cls, values): - target_module_groups = values.get("target_module_groups") - if target_module_groups is not None: - values["target_modules"] = [item for sublist in target_module_groups for item in sublist] - search_space = values.get("search_space") - if search_space is not None: - assert len(search_space) == len(target_module_groups) + if values.get("neural_lora_search"): + target_module_groups = values.get("target_module_groups") + search_space = values.get("search_space") + if target_module_groups is None or search_space is None: + raise ValueError("Please specified `target_module_groups` and `search_space` when using NLS strategy.") + if len(search_space) != len(target_module_groups): + raise ValueError("The length of `search_space` must be equal to the length of `target_module_groups`.") + values["target_modules"] = [module for groups in target_module_groups for module in groups] return values From a304263d777a18ca213dfddad3b7df24c53aceb8 Mon Sep 17 00:00:00 2001 From: ZePan110 Date: Tue, 26 Nov 2024 19:15:07 +0800 Subject: [PATCH 07/17] Fix build issue (#946) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: ZePan110 Signed-off-by: J. Pablo Muñoz --- tests/retrievers/test_retrievers_pathway_langchain.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/retrievers/test_retrievers_pathway_langchain.sh b/tests/retrievers/test_retrievers_pathway_langchain.sh index a1e4e773a7..33d60b025f 100644 --- a/tests/retrievers/test_retrievers_pathway_langchain.sh +++ b/tests/retrievers/test_retrievers_pathway_langchain.sh @@ -10,9 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}') function build_docker_images() { cd $WORKPATH - cd comps/vectorstores/pathway - - docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/vectorstore-pathway:comps . + docker build --no-cache --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -t opea/vectorstore-pathway:comps -f comps/vectorstores/pathway/Dockerfile . cd $WORKPATH From 98bc3f8293f6cb755a6784837db07c0fed2f0a56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Pablo=20Mu=C3=B1oz?= Date: Wed, 27 Nov 2024 10:24:24 -0800 Subject: [PATCH 08/17] Fix issue with copying folders in sub-adapter extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Yuan0320 Signed-off-by: J. 
Pablo Muñoz --- comps/finetuning/utils/extract_sub_adapter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/comps/finetuning/utils/extract_sub_adapter.py b/comps/finetuning/utils/extract_sub_adapter.py index 00f477f684..2f5eccde32 100644 --- a/comps/finetuning/utils/extract_sub_adapter.py +++ b/comps/finetuning/utils/extract_sub_adapter.py @@ -3,6 +3,7 @@ import os import re +import shutil import torch from peft.utils import CONFIG_NAME, SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME @@ -97,4 +98,4 @@ def main(adapter_model_path, nncf_config, adapter_version, custom_config=None): os.makedirs(output_dir, exist_ok=True) torch.save(sub_adapter_weights, os.path.join(output_dir, WEIGHTS_NAME)) config_path = os.path.join(adapter_model_path, CONFIG_NAME) - os.system(f"cp {config_path} {output_dir}") + shutil.copy(config_path, output_dir) From 606ef11003b3054e5ea66f74a65b47408d13fc60 Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Mon, 2 Dec 2024 14:36:44 +0800 Subject: [PATCH 09/17] Temporarily remove test due to sqft environment (additional installation) Signed-off-by: Yuan0320 --- tests/finetuning/test_finetuning.sh | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/finetuning/test_finetuning.sh b/tests/finetuning/test_finetuning.sh index c333a2f228..11a544dfda 100644 --- a/tests/finetuning/test_finetuning.sh +++ b/tests/finetuning/test_finetuning.sh @@ -144,18 +144,6 @@ function validate_microservice() { '{"training_file": "test_data.json","model": "facebook/opt-125m"}' - ########################## - # sqft test # - ########################## - # test /v1/fine_tuning/jobs - validate_finetune \ - "http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs" \ - "sqft - finetuning" \ - "test-comps-finetuning-server" \ - '{"id":"ft-job' \ - '{"training_file": "test_data.json","model": "facebook/opt-125m", "General": {"lora_config": {"r": 8, "neural_lora_search": true, "target_module_groups": [["q_proj"]], "search_space": ["8,6,4"]}}}' - - ########################## # rerank test # ########################## From c83f86aea6927cfd277c9a6d875c727683d72b3e Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Mon, 2 Dec 2024 20:01:51 +0800 Subject: [PATCH 10/17] Add sqft test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: J. 
Pablo Muñoz Signed-off-by: Yuan0320 --- comps/finetuning/Dockerfile.sqft | 66 +++++++++++++++++++++++++++++ tests/finetuning/test_finetuning.sh | 59 +++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 comps/finetuning/Dockerfile.sqft diff --git a/comps/finetuning/Dockerfile.sqft b/comps/finetuning/Dockerfile.sqft new file mode 100644 index 0000000000..ee47310fb2 --- /dev/null +++ b/comps/finetuning/Dockerfile.sqft @@ -0,0 +1,66 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Use the same python version with ray +FROM python:3.10.14 + +ARG HF_TOKEN + +ENV HF_TOKEN=$HF_TOKEN + +RUN useradd -m -s /bin/bash user && \ + mkdir -p /home/user && \ + chown -R user /home/user/ + +COPY comps /home/user/comps + +RUN chown -R user /home/user/comps/finetuning + +USER user + +ENV PATH=$PATH:/home/user/.local/bin + +RUN python -m pip install --no-cache-dir --upgrade pip && \ + python -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \ + python -m pip install --no-cache-dir intel-extension-for-pytorch && \ + python -m pip install --no-cache-dir oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \ + python -m pip install --no-cache-dir -r /home/user/comps/finetuning/requirements.txt + +WORKDIR /home/user/comps/finetuning + +RUN git clone https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning.git && \ + cp -r Hardware-Aware-Automated-Machine-Learning/SQFT/patches /home/user/comps/finetuning/patches && \ + rm -rf Hardware-Aware-Automated-Machine-Learning && \ + mkdir third_party + +# Clone and set up transformers +RUN git clone https://github.com/huggingface/transformers.git third_party/transformers && \ + cd third_party/transformers && \ + git checkout v4.44.2 && \ + git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/transformers-v4.44.2.patch && \ + pip install -e . + +# Clone and set up peft +RUN git clone https://github.com/huggingface/peft.git third_party/peft && \ + cd third_party/peft && \ + git checkout v0.10.0 && \ + git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/peft-v0.10.0.patch && \ + pip install -e . + +# Clone and set up nncf +RUN git clone https://github.com/openvinotoolkit/nncf.git third_party/nncf && \ + cd third_party/nncf && \ + git checkout f143e1c && \ + git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/nncf-f143e1c.patch && \ + pip install -e . 
+ +ENV PYTHONPATH=$PYTHONPATH:/home/user + +RUN echo PKGPATH=$(python3 -c "import pkg_resources; print(pkg_resources.get_distribution('oneccl-bind-pt').location)") >> run.sh && \ + echo 'export LD_LIBRARY_PATH=$PKGPATH/oneccl_bindings_for_pytorch/opt/mpi/lib/:$LD_LIBRARY_PATH' >> run.sh && \ + echo 'source $PKGPATH/oneccl_bindings_for_pytorch/env/setvars.sh' >> run.sh && \ + echo ray start --head --dashboard-host=0.0.0.0 >> run.sh && \ + echo export RAY_ADDRESS=http://localhost:8265 >> run.sh && \ + echo python finetuning_service.py >> run.sh + +CMD bash run.sh diff --git a/tests/finetuning/test_finetuning.sh b/tests/finetuning/test_finetuning.sh index 11a544dfda..6314bad81b 100644 --- a/tests/finetuning/test_finetuning.sh +++ b/tests/finetuning/test_finetuning.sh @@ -22,6 +22,19 @@ function build_docker_images() { fi } +function build_sqft_docker_images() { + cd $WORKPATH + echo $(pwd) + # TODO: get the Dockerfile from the SQFT source repository instead of comps/finetuning/Dockerfile.sqft. + docker build -t opea/finetuning:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg HF_TOKEN=$HF_TOKEN -f comps/finetuning/Dockerfile.sqft . + if [ $? -ne 0 ]; then + echo "opea/finetuning (sqft) built fail" + exit 1 + else + echo "opea/finetuning (sqft) built successful" + fi +} + function start_service() { export no_proxy="localhost,127.0.0.1,"${ip_address} docker run -d --name="test-comps-finetuning-server" -p $finetuning_service_port:$finetuning_service_port -p $ray_port:$ray_port --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy opea/finetuning:comps @@ -225,6 +238,44 @@ EOF } +function validate_sqft_microservice() { + cd $LOG_PATH + export no_proxy="localhost,127.0.0.1,"${ip_address} + + ########################## + # general test # + ########################## + # test /v1/dataprep upload file + echo '[{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. 
Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. 
Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."}]' > $LOG_PATH/test_data.json + validate_upload \ + "http://${ip_address}:$finetuning_service_port/v1/files" \ + "general - upload" \ + "test-comps-finetuning-server" \ + "fine-tune" \ + "test_data.json" + + # test /v1/fine_tuning/jobs + validate_finetune \ + "http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs" \ + "general - finetuning" \ + "test-comps-finetuning-server" \ + '{"id":"ft-job' \ + '{"training_file": "test_data.json","model": "facebook/opt-125m"}' + + + ########################## + # sqft test # + ########################## + # test /v1/fine_tuning/jobs + validate_finetune \ + "http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs" \ + "sqft - finetuning" \ + "test-comps-finetuning-server" \ + '{"id":"ft-job' \ + '{"training_file": "test_data.json","model": "facebook/opt-125m", "General": {"lora_config": {"r": 8, "neural_lora_search": true, "target_module_groups": [["q_proj"]], "search_space": ["8,6,4"]}}}' + +} + function stop_docker() { cid=$(docker ps -aq --filter "name=test-comps-finetuning-server*") if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi @@ -233,12 +284,16 @@ function stop_docker() { function main() { stop_docker - build_docker_images start_service - validate_microservice + # test sqft + stop_docker + build_sqft_docker_images + start_service + validate_sqft_microservice + stop_docker echo y | docker system prune From 95e1f26de58ba5d1dfe68c310aceb41ec8bf2d62 Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Tue, 3 Dec 2024 08:55:38 +0800 Subject: [PATCH 11/17] Get the Dockerfile from SQFT repo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: J. 
Pablo Muñoz Signed-off-by: Yuan0320 --- comps/finetuning/Dockerfile.sqft | 66 ----------------------------- tests/finetuning/test_finetuning.sh | 2 +- 2 files changed, 1 insertion(+), 67 deletions(-) delete mode 100644 comps/finetuning/Dockerfile.sqft diff --git a/comps/finetuning/Dockerfile.sqft b/comps/finetuning/Dockerfile.sqft deleted file mode 100644 index ee47310fb2..0000000000 --- a/comps/finetuning/Dockerfile.sqft +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -# Use the same python version with ray -FROM python:3.10.14 - -ARG HF_TOKEN - -ENV HF_TOKEN=$HF_TOKEN - -RUN useradd -m -s /bin/bash user && \ - mkdir -p /home/user && \ - chown -R user /home/user/ - -COPY comps /home/user/comps - -RUN chown -R user /home/user/comps/finetuning - -USER user - -ENV PATH=$PATH:/home/user/.local/bin - -RUN python -m pip install --no-cache-dir --upgrade pip && \ - python -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu && \ - python -m pip install --no-cache-dir intel-extension-for-pytorch && \ - python -m pip install --no-cache-dir oneccl_bind_pt --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ && \ - python -m pip install --no-cache-dir -r /home/user/comps/finetuning/requirements.txt - -WORKDIR /home/user/comps/finetuning - -RUN git clone https://github.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning.git && \ - cp -r Hardware-Aware-Automated-Machine-Learning/SQFT/patches /home/user/comps/finetuning/patches && \ - rm -rf Hardware-Aware-Automated-Machine-Learning && \ - mkdir third_party - -# Clone and set up transformers -RUN git clone https://github.com/huggingface/transformers.git third_party/transformers && \ - cd third_party/transformers && \ - git checkout v4.44.2 && \ - git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/transformers-v4.44.2.patch && \ - pip install -e . - -# Clone and set up peft -RUN git clone https://github.com/huggingface/peft.git third_party/peft && \ - cd third_party/peft && \ - git checkout v0.10.0 && \ - git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/peft-v0.10.0.patch && \ - pip install -e . - -# Clone and set up nncf -RUN git clone https://github.com/openvinotoolkit/nncf.git third_party/nncf && \ - cd third_party/nncf && \ - git checkout f143e1c && \ - git apply --ignore-space-change --ignore-whitespace /home/user/comps/finetuning/patches/nncf-f143e1c.patch && \ - pip install -e . 
- -ENV PYTHONPATH=$PYTHONPATH:/home/user - -RUN echo PKGPATH=$(python3 -c "import pkg_resources; print(pkg_resources.get_distribution('oneccl-bind-pt').location)") >> run.sh && \ - echo 'export LD_LIBRARY_PATH=$PKGPATH/oneccl_bindings_for_pytorch/opt/mpi/lib/:$LD_LIBRARY_PATH' >> run.sh && \ - echo 'source $PKGPATH/oneccl_bindings_for_pytorch/env/setvars.sh' >> run.sh && \ - echo ray start --head --dashboard-host=0.0.0.0 >> run.sh && \ - echo export RAY_ADDRESS=http://localhost:8265 >> run.sh && \ - echo python finetuning_service.py >> run.sh - -CMD bash run.sh diff --git a/tests/finetuning/test_finetuning.sh b/tests/finetuning/test_finetuning.sh index 6314bad81b..0c11b866e5 100644 --- a/tests/finetuning/test_finetuning.sh +++ b/tests/finetuning/test_finetuning.sh @@ -25,7 +25,7 @@ function build_docker_images() { function build_sqft_docker_images() { cd $WORKPATH echo $(pwd) - # TODO: get the Dockerfile from the SQFT source repository instead of comps/finetuning/Dockerfile.sqft. + curl -o comps/finetuning/Dockerfile.sqft https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/main/SQFT/opea/Dockerfile docker build -t opea/finetuning:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg HF_TOKEN=$HF_TOKEN -f comps/finetuning/Dockerfile.sqft . if [ $? -ne 0 ]; then echo "opea/finetuning (sqft) built fail" From 24ec2a7400440fe6fbea3e0a1e51593c89a2b5e1 Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Tue, 3 Dec 2024 12:00:25 +0800 Subject: [PATCH 12/17] Add tests for adapter merging and extract MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: J. Pablo Muñoz Signed-off-by: Yuan0320 --- tests/finetuning/test_finetuning.sh | 69 +++++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/tests/finetuning/test_finetuning.sh b/tests/finetuning/test_finetuning.sh index 0c11b866e5..a1357ed83f 100644 --- a/tests/finetuning/test_finetuning.sh +++ b/tests/finetuning/test_finetuning.sh @@ -130,6 +130,35 @@ function validate_finetune() { fi sleep 1m done + + echo "$FINTUNING_ID" +} + +function validate_merge_or_extract_adapter() { + local URL="$1" + local SERVICE_NAME="$2" + local DOCKER_NAME="$3" + local EXPECTED_DATA="$4" + local INPUT_DATA="$5" + + HTTP_RESPONSE=$(curl --silent --write-out "HTTPSTATUS:%{http_code}" -X POST -H 'Content-Type: application/json' -d "$INPUT_DATA" "$URL") + HTTP_STATUS=$(echo $HTTP_RESPONSE | tr -d '\n' | sed -e 's/.*HTTPSTATUS://') + RESPONSE_BODY=$(echo $HTTP_RESPONSE | sed -e 's/HTTPSTATUS\:.*//g') + + if [ "$HTTP_STATUS" -ne "200" ]; then + echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + exit 1 + else + echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." + fi + + # Check if the parsed values match the expected values + if [[ "$RESPONSE_BODY" != *"$EXPECTED_DATA"* ]]; then + echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + exit 1 + else + echo "[ $SERVICE_NAME ] Content is as expected." + fi } function validate_microservice() { @@ -243,7 +272,7 @@ function validate_sqft_microservice() { export no_proxy="localhost,127.0.0.1,"${ip_address} ########################## - # general test # + # general test # ########################## # test /v1/dataprep upload file echo '[{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. 
Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. 
Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."},{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. 
Get enough sleep and maintain a consistent sleep schedule."}]' > $LOG_PATH/test_data.json @@ -254,25 +283,49 @@ function validate_sqft_microservice() { "fine-tune" \ "test_data.json" - # test /v1/fine_tuning/jobs - validate_finetune \ + # test /v1/fine_tuning/jobs (LoRA) + FINTUNING_ID=$(validate_finetune \ "http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs" \ "general - finetuning" \ "test-comps-finetuning-server" \ '{"id":"ft-job' \ - '{"training_file": "test_data.json","model": "facebook/opt-125m"}' + '{"training_file": "test_data.json","model": "facebook/opt-125m", "General": {"lora_config": {"r": 8, "target_modules": ["q_proj"]}}}') + + # test merging the LoRA adapter into the base model + validate_merge_or_extract_adapter \ + "http://${ip_address}:$finetuning_service_port/v1/finetune/merge_adapter" \ + "adapter merge" \ + "test-comps-finetuning-server" \ + "${FINTUNING_ID}" \ + "{\"fine_tuning_job_id\": \"${FINTUNING_ID}\"}" ########################## - # sqft test # + # sqft test # ########################## - # test /v1/fine_tuning/jobs - validate_finetune \ + # test /v1/fine_tuning/jobs (SQFT-NLS) + FINTUNING_ID=$(validate_finetune \ "http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs" \ "sqft - finetuning" \ "test-comps-finetuning-server" \ '{"id":"ft-job' \ - '{"training_file": "test_data.json","model": "facebook/opt-125m", "General": {"lora_config": {"r": 8, "neural_lora_search": true, "target_module_groups": [["q_proj"]], "search_space": ["8,6,4"]}}}' + '{"training_file": "test_data.json","model": "facebook/opt-125m", "General": {"lora_config": {"r": 8, "neural_lora_search": true, "target_module_groups": [["q_proj"]], "search_space": ["8,6,4"]}}}') + + # test extracting heuristic sub-adapter + validate_merge_or_extract_adapter \ + "http://${ip_address}:$finetuning_service_port/v1/finetune/extract_sub_adapter" \ + "extract sub-adapter" \ + "test-comps-finetuning-server" \ + "${FINTUNING_ID}" \ + "{\"fine_tuning_job_id\": \"${FINTUNING_ID}\", \"adapter_version\": \"heuristic\"}" + + # test merging the heuristic sub-adapter into the base model + validate_merge_or_extract_adapter \ + "http://${ip_address}:$finetuning_service_port/v1/finetune/merge_adapter" \ + "adapter merge" \ + "test-comps-finetuning-server" \ + "${FINTUNING_ID}" \ + "{\"fine_tuning_job_id\": \"${FINTUNING_ID}\", \"adapter_version\": \"heuristic\"}" } From 812aa4a73c67d05279a94a302264c384ad6f790e Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Tue, 3 Dec 2024 12:43:28 +0800 Subject: [PATCH 13/17] Add --no-cache option to sqft Docker build process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: J. Pablo Muñoz Signed-off-by: Yuan0320 --- tests/finetuning/test_finetuning.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/finetuning/test_finetuning.sh b/tests/finetuning/test_finetuning.sh index a1357ed83f..363f26886b 100644 --- a/tests/finetuning/test_finetuning.sh +++ b/tests/finetuning/test_finetuning.sh @@ -26,7 +26,7 @@ function build_sqft_docker_images() { cd $WORKPATH echo $(pwd) curl -o comps/finetuning/Dockerfile.sqft https://raw.githubusercontent.com/IntelLabs/Hardware-Aware-Automated-Machine-Learning/main/SQFT/opea/Dockerfile - docker build -t opea/finetuning:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg HF_TOKEN=$HF_TOKEN -f comps/finetuning/Dockerfile.sqft . 
+ docker build --no-cache -t opea/finetuning:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg HF_TOKEN=$HF_TOKEN -f comps/finetuning/Dockerfile.sqft . if [ $? -ne 0 ]; then echo "opea/finetuning (sqft) built fail" exit 1 From fcd5f3d644ea9fcb8b64bc51c3271255c2ab89c3 Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Tue, 3 Dec 2024 13:30:29 +0800 Subject: [PATCH 14/17] fix: resolve issue with FINTUNING_ID handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: J. Pablo Muñoz Signed-off-by: Yuan0320 --- tests/finetuning/test_finetuning.sh | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/finetuning/test_finetuning.sh b/tests/finetuning/test_finetuning.sh index 363f26886b..165c2ce3fc 100644 --- a/tests/finetuning/test_finetuning.sh +++ b/tests/finetuning/test_finetuning.sh @@ -130,8 +130,6 @@ function validate_finetune() { fi sleep 1m done - - echo "$FINTUNING_ID" } function validate_merge_or_extract_adapter() { @@ -284,12 +282,12 @@ function validate_sqft_microservice() { "test_data.json" # test /v1/fine_tuning/jobs (LoRA) - FINTUNING_ID=$(validate_finetune \ + validate_finetune \ "http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs" \ "general - finetuning" \ "test-comps-finetuning-server" \ '{"id":"ft-job' \ - '{"training_file": "test_data.json","model": "facebook/opt-125m", "General": {"lora_config": {"r": 8, "target_modules": ["q_proj"]}}}') + '{"training_file": "test_data.json","model": "facebook/opt-125m", "General": {"lora_config": {"r": 8, "target_modules": ["q_proj"]}}}' # test merging the LoRA adapter into the base model validate_merge_or_extract_adapter \ @@ -304,12 +302,12 @@ function validate_sqft_microservice() { # sqft test # ########################## # test /v1/fine_tuning/jobs (SQFT-NLS) - FINTUNING_ID=$(validate_finetune \ + validate_finetune \ "http://${ip_address}:$finetuning_service_port/v1/fine_tuning/jobs" \ "sqft - finetuning" \ "test-comps-finetuning-server" \ '{"id":"ft-job' \ - '{"training_file": "test_data.json","model": "facebook/opt-125m", "General": {"lora_config": {"r": 8, "neural_lora_search": true, "target_module_groups": [["q_proj"]], "search_space": ["8,6,4"]}}}') + '{"training_file": "test_data.json","model": "facebook/opt-125m", "General": {"lora_config": {"r": 8, "neural_lora_search": true, "target_module_groups": [["q_proj"]], "search_space": ["8,6,4"]}}}' # test extracting heuristic sub-adapter validate_merge_or_extract_adapter \ From 44781f02c849d84fe8687435e13a37f623e80eda Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Tue, 3 Dec 2024 17:48:31 +0800 Subject: [PATCH 15/17] Add logging of Docker container output on HTTP status or content mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: J. Pablo Muñoz Signed-off-by: Yuan0320 --- tests/finetuning/test_finetuning.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/finetuning/test_finetuning.sh b/tests/finetuning/test_finetuning.sh index 165c2ce3fc..554525a0cc 100644 --- a/tests/finetuning/test_finetuning.sh +++ b/tests/finetuning/test_finetuning.sh @@ -145,6 +145,7 @@ function validate_merge_or_extract_adapter() { if [ "$HTTP_STATUS" -ne "200" ]; then echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" + docker logs $DOCKER_NAME >> ${LOG_PATH}/finetuning-server_merge_or_extract_adapter.log exit 1 else echo "[ $SERVICE_NAME ] HTTP status is 200. 
Checking content..." @@ -153,6 +154,7 @@ function validate_merge_or_extract_adapter() { # Check if the parsed values match the expected values if [[ "$RESPONSE_BODY" != *"$EXPECTED_DATA"* ]]; then echo "[ $SERVICE_NAME ] Content does not match the expected result: $RESPONSE_BODY" + docker logs $DOCKER_NAME >> ${LOG_PATH}/finetuning-server_merge_or_extract_adapter.log exit 1 else echo "[ $SERVICE_NAME ] Content is as expected." From a0446eb0dffbe857f6483106009b59643545af3c Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Tue, 3 Dec 2024 19:53:47 +0800 Subject: [PATCH 16/17] fix: resolve ipex issue and remove useless code in merge_adapter.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: J. Pablo Muñoz Signed-off-by: Yuan0320 --- comps/finetuning/utils/merge_adapter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/comps/finetuning/utils/merge_adapter.py b/comps/finetuning/utils/merge_adapter.py index f1bca2ab51..44fd01e8ad 100644 --- a/comps/finetuning/utils/merge_adapter.py +++ b/comps/finetuning/utils/merge_adapter.py @@ -1,6 +1,7 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import intel_extension_for_pytorch from peft import PeftModel from transformers import AutoModelForCausalLM, AutoTokenizer @@ -9,8 +10,6 @@ def main(base_model_path, adapter_model_path, output_path): base_model = AutoModelForCausalLM.from_pretrained(base_model_path, trust_remote_code=True) model = PeftModel.from_pretrained(base_model, adapter_model_path) model.eval() - for name, param in model.named_parameters(): - param.requires_grad = False merged_model = model.merge_and_unload() merged_model.train(False) base_model.save_pretrained(output_path, state_dict=merged_model.state_dict()) From 68f2bb0292eb383727e767b37c94b998a9db64c1 Mon Sep 17 00:00:00 2001 From: Yuan0320 Date: Tue, 3 Dec 2024 20:59:04 +0800 Subject: [PATCH 17/17] Add tests for custom sub-adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: J. 
Pablo Muñoz Signed-off-by: Yuan0320 --- tests/finetuning/test_finetuning.sh | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/finetuning/test_finetuning.sh b/tests/finetuning/test_finetuning.sh index 554525a0cc..1dcb6d7b60 100644 --- a/tests/finetuning/test_finetuning.sh +++ b/tests/finetuning/test_finetuning.sh @@ -314,7 +314,7 @@ function validate_sqft_microservice() { # test extracting heuristic sub-adapter validate_merge_or_extract_adapter \ "http://${ip_address}:$finetuning_service_port/v1/finetune/extract_sub_adapter" \ - "extract sub-adapter" \ + "extract heuristic sub-adapter" \ "test-comps-finetuning-server" \ "${FINTUNING_ID}" \ "{\"fine_tuning_job_id\": \"${FINTUNING_ID}\", \"adapter_version\": \"heuristic\"}" @@ -322,11 +322,27 @@ function validate_sqft_microservice() { # test merging the heuristic sub-adapter into the base model validate_merge_or_extract_adapter \ "http://${ip_address}:$finetuning_service_port/v1/finetune/merge_adapter" \ - "adapter merge" \ + "merge heuristic sub-adapter" \ "test-comps-finetuning-server" \ "${FINTUNING_ID}" \ "{\"fine_tuning_job_id\": \"${FINTUNING_ID}\", \"adapter_version\": \"heuristic\"}" + # test extracting sub-adapter with custom configuration + validate_merge_or_extract_adapter \ + "http://${ip_address}:$finetuning_service_port/v1/finetune/extract_sub_adapter" \ + "extract custom sub-adapter" \ + "test-comps-finetuning-server" \ + "${FINTUNING_ID}" \ + "{\"fine_tuning_job_id\": \"${FINTUNING_ID}\", \"adapter_version\": \"custom\", \"custom_config\": [8, 6, 4, 4, 8, 6, 8, 8, 8, 8, 4, 8]}" + + # test merging the custom sub-adapter into the base model + validate_merge_or_extract_adapter \ + "http://${ip_address}:$finetuning_service_port/v1/finetune/merge_adapter" \ + "merge custom sub-adapter" \ + "test-comps-finetuning-server" \ + "${FINTUNING_ID}" \ + "{\"fine_tuning_job_id\": \"${FINTUNING_ID}\", \"adapter_version\": \"custom\"}" + } function stop_docker() {