ContrastivePruning/StructuredPruning/run_squad.py

# coding=utf-8
# Copyright 2020 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for question answering.
"""
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.

import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

from datasets import load_dataset, load_metric

import torch
import transformers
from trainer_qa import QuestionAnsweringTrainer
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PreTrainedTokenizerFast,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process
from utils_qa import postprocess_qa_predictions
from torch.utils.data import DataLoader

from model.PruneBert import PruneBertForQuestionAnswering
from model.TeacherBert import TeacherBertForQuestionAnswering
from prune.prune_utils import determine_pruning_sequence, what_to_prune_head, calculate_head_and_intermediate_importance, what_to_prune_mlp


logger = logging.getLogger(__name__)

from transformers.models.bert import BertPreTrainedModel, BertModel
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.modeling_outputs import QuestionAnsweringModelOutput
import torch.nn as nn
from tqdm import tqdm

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_seq_length: int = field(
        default=384,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch (which can "
            "be faster on GPU but will be slower on TPU)."
        },
    )
    version_2_with_negative: bool = field(
        default=False, metadata={"help": "If true, some of the examples do not have an answer."}
    )
    null_score_diff_threshold: float = field(
        default=0.0,
        metadata={
            "help": "The threshold used to select the null answer: if the best answer has a score that is less than "
            "the score of the null answer minus this threshold, the null answer is selected for this example. "
            "Only useful when `version_2_with_negative=True`."
        },
    )
    doc_stride: int = field(
        default=128,
        metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
    )
    n_best_size: int = field(
        default=20,
        metadata={"help": "The total number of n-best predictions to generate when looking for an answer."},
    )
    max_answer_length: int = field(
        default=30,
        metadata={
            "help": "The maximum length of an answer that can be generated. This is needed because the start "
            "and end predictions are not conditioned on one another."
        },
    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        else:
            if self.train_file is not None:
                extension = self.train_file.split(".")[-1]
                assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            if self.validation_file is not None:
                extension = self.validation_file.split(".")[-1]
                assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."

@dataclass
class CAPTrainingArguments(TrainingArguments):
    do_prune: bool = field(
        default=False,
    )
    prune_percent: str = field(
        default="",
    )
    retrain_num_train_epochs: float = field(
        default=2.0,
    )
    ce_loss_weight: float = field(
        default=1.0,
    )
    cl_unsupervised_loss_weight: float = field(
        default=0.0,
    )
    contrastive_temperature: float = field(
        default=0.1,
    )
    extra_examples: int = field(
        default=4096,
    )
    use_contrastive_loss: bool = field(
        default=False,
    )
    alignrep: str = field(
        default='cls',
    )
    # for pruning
    at_least_x_heads_per_layer: int = field(
        default=1,
    )
    normalize_pruning_by_layer: bool = field(
        default=False,
    )
    subset_ratio: float = field(
        default=1.0,
    )
    # for distillation
    use_distill: bool = field(
        default=False,
    )
    teacher_path: str = field(
        default='',
    )
    distill_temperature: float = field(
        default=1.0,
    )
    distill_loss_weight: float = field(
        default=0.0,
    )

def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CAPTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name+'.py', data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    teacher = TeacherBertForQuestionAnswering.from_pretrained(
        training_args.teacher_path,
        from_tf=bool(".ckpt" in training_args.teacher_path),
        config=config,
        alignrep=training_args.alignrep
    )
    teacher = teacher.cuda()

    model = PruneBertForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
        contrastive_temperature=training_args.contrastive_temperature,
        ce_loss_weight=training_args.ce_loss_weight,
        cl_unsupervised_loss_weight=training_args.cl_unsupervised_loss_weight,
        distill_loss_weight=training_args.distill_loss_weight,
        extra_examples=training_args.extra_examples,
        alignrep=training_args.alignrep,
        get_teacher_logits=teacher.get_teacher_logits if training_args.use_distill else None,
        distill_temperature=training_args.distill_temperature
    )


    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement"
        )

    # Preprocessing the datasets.
    # Preprocessing is slighlty different for training and evaluation.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    # Training preprocessing
    def prepare_train_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possible giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]

            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                    token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)

        return tokenized_examples

    if training_args.do_train:
        train_dataset = datasets["train"].map(
            prepare_train_features,
            batched=True,
            with_indices=False, 
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
        def add_id(examples, idx):
            examples['idx'] = idx
            return examples
        train_dataset = train_dataset.map(
            add_id,
            batched=True,
            with_indices=True, 
        )
        

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possible giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=data_args.max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if training_args.do_eval:
        validation_dataset = datasets["validation"].map(
            prepare_validation_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
    # collator.
    data_collator = (
        default_data_collator
        if data_args.pad_to_max_length
        else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
    )

    # Post-processing:
    def post_processing_function(examples, features, predictions):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions = postprocess_qa_predictions(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=data_args.version_2_with_negative,
            n_best_size=data_args.n_best_size,
            max_answer_length=data_args.max_answer_length,
            null_score_diff_threshold=data_args.null_score_diff_threshold,
            output_dir=training_args.output_dir,
            is_world_process_zero=trainer.is_world_process_zero(),
        )
        # Format the result to the format the metric expects.
        if data_args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in datasets["validation"]]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad_metric.py")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions, references=p.label_ids)

    # Initialize our Trainer
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=validation_dataset if training_args.do_eval else None,
        eval_examples=datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        global_representations_bank_finetuned = None
        global_representations_bank_pretrained = None
        global_representations_bank_snaps = None

        # contrastive learning of finetuned model
        if training_args.use_contrastive_loss:
            dataloader = DataLoader(
                train_dataset,
                batch_size=trainer.args.train_batch_size,
                shuffle=False,
                collate_fn=trainer.data_collator,
                drop_last=trainer.args.dataloader_drop_last,
                num_workers=trainer.args.dataloader_num_workers,
                pin_memory=trainer.args.dataloader_pin_memory,
            )
            with torch.no_grad():
                for inputs in tqdm(dataloader):
                    inputs = trainer._prepare_inputs(inputs)
                    representations = teacher(encode_example=True, **inputs).cpu()
                    if global_representations_bank_finetuned is None:
                        global_representations_bank_finetuned = representations
                    else:
                        global_representations_bank_finetuned = torch.cat((global_representations_bank_finetuned, representations), dim=0)
            global_representations_bank_finetuned = global_representations_bank_finetuned.unsqueeze(1)

        if not training_args.use_distill:
            teacher = None

        if training_args.do_prune:
            model = trainer._wrap_model(trainer.model, training=False)
            model = model.module if hasattr(model, 'module') else model

            # Determine the number of heads to prune
            prune_percent = training_args.prune_percent
            prune_percent = None if prune_percent == '' else [float(x) for x in prune_percent.split(',')]

            prune_sequence_head, prune_sequence_intermediate = determine_pruning_sequence(
                prune_percent,
                config.num_hidden_layers,
                config.num_attention_heads,
                config.intermediate_size,
                training_args.at_least_x_heads_per_layer,
            )
            prune_sequence = zip(prune_sequence_head, prune_sequence_intermediate)
            
            for step, (n_to_prune_head, n_to_prune_intermediate) in enumerate(prune_sequence):
                logger.info("We are going to prune {} heads and {} intermediate !!!".format(n_to_prune_head, n_to_prune_intermediate))
                head_importance, intermediate_importance = calculate_head_and_intermediate_importance(
                    model,
                    train_dataset,
                    old_head_mask=model.head_mask,
                    old_intermediate_mask=model.intermediate_mask,
                    trainer=trainer,
                    normalize_scores_by_layer=training_args.normalize_pruning_by_layer,
                    subset_size=training_args.subset_ratio
                ) 
                for layer in range(len(head_importance)):
                    layer_scores = head_importance[layer].cpu().data
                    logger.info("head importance score")
                    logger.info("\t".join(f"{x:.5f}" for x in layer_scores))
                # Determine which heads to prune
                new_head_mask = what_to_prune_head(
                    head_importance,
                    n_to_prune=n_to_prune_head,
                    old_head_mask=model.head_mask,
                    at_least_x_heads_per_layer=training_args.at_least_x_heads_per_layer,
                )
                new_intermediate_mask = what_to_prune_mlp(
                    intermediate_importance,
                    n_to_prune=n_to_prune_intermediate,
                    old_intermediate_mask=model.intermediate_mask
                )
                for layer in range(len(new_head_mask)):
                    y = new_head_mask[layer].cpu().data
                    logger.info("head mask")
                    logger.info("\t".join("{}".format(int(x)) for x in y))
                logger.info("intermediate mask")
                for layer in range(len(new_intermediate_mask)):
                    y = new_intermediate_mask[layer]
                    logger.info("Layer {} has {} intermediate active.".format(layer, torch.sum(y)))
            
                # calculate and store example representations and labels (for verification)
                if training_args.use_contrastive_loss:
                    representations_bank = None
                    dataloader = DataLoader(
                        train_dataset,
                        batch_size=trainer.args.train_batch_size,
                        shuffle=False,
                        collate_fn=trainer.data_collator,
                        drop_last=trainer.args.dataloader_drop_last,
                        num_workers=trainer.args.dataloader_num_workers,
                        pin_memory=trainer.args.dataloader_pin_memory,
                    )
                    with torch.no_grad():
                        for inputs in tqdm(dataloader):
                            inputs = trainer._prepare_inputs(inputs)
                            representations = model(encode_example=True, **inputs).cpu()
                            if representations_bank is None:
                                representations_bank = representations
                            else:
                                representations_bank = torch.cat((representations_bank, representations), dim=0)

                    if step == 0:
                        # add to global representations bank for pretrained
                        global_representations_bank_pretrained = representations_bank
                        global_representations_bank_pretrained = global_representations_bank_pretrained.unsqueeze(1)
                    else:
                        # add to global representations bank for snaps
                        if global_representations_bank_snaps is None:
                            global_representations_bank_snaps = representations_bank.unsqueeze(1)
                        else:
                            global_representations_bank_snaps = torch.cat((global_representations_bank_snaps, representations_bank.unsqueeze(1)), dim=1)
                        
                    # update bank
                    model.global_representations_bank_finetuned = global_representations_bank_finetuned
                    model.global_representations_bank_pretrained = global_representations_bank_pretrained
                    model.global_representations_bank_snaps = global_representations_bank_snaps

                # apply structured pruing
                model.head_mask[:] = new_head_mask.clone()
                model.intermediate_mask[:] = new_intermediate_mask.clone()

                # re-train
                trainer.optimizer = trainer.lr_scheduler = None
                trainer.args.num_train_epochs = training_args.retrain_num_train_epochs
                trainer.train()

                # re-eval
                metrics = trainer.evaluate(eval_dataset=validation_dataset)
                task = data_args.dataset_name
                new_metrics = {}
                for k in metrics:
                    new_metrics["{}_{}_{}".format(task, k, step+1)] = metrics[k]
                metrics = new_metrics
                trainer.log_metrics("eval_{}".format(step+1), metrics)
                trainer.save_metrics("eval_{}".format(step+1), metrics)

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in sorted(results.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results


def _mp_fn(index):
    # For xla_spawn (TPUs)
    main()


if __name__ == "__main__":
    main()