add all metric

OptimalScale · Jan 19, 2024 · fdd351c · fdd351c
1 parent 46e7582
commit fdd351c
Show file tree

Hide file tree

Showing 22 changed files with 18,708 additions and 34 deletions.
diff --git a/configs/accelerator_multigpu_config.yaml b/configs/accelerator_multigpu_config.yaml
@@ -8,7 +8,7 @@ machine_rank: 0
 main_training_function: main
 mixed_precision: bf16
 num_machines: 1
-num_processes: 2
+num_processes: 1
 rdzv_backend: static
 same_network: true
 tpu_env: []

diff --git a/configs/ds_config_zero2_nocpu.json b/configs/ds_config_zero2_nocpu.json
@@ -0,0 +1,41 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+    "bf16": {
+        "enabled": "auto"
+    },
+
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 2,
+        "allgather_partitions": true,
+        "allgather_bucket_size": 2e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 2e8,
+        "contiguous_gradients": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
diff --git a/configs/ds_config_zero3_nocpu.json b/configs/ds_config_zero3_nocpu.json
@@ -0,0 +1,44 @@
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+    "bf16": {
+        "enabled": "auto"
+    },
+
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
diff --git a/csqa_dev_data_promptv2_1221.json b/csqa_dev_data_promptv2_1221.json
diff --git a/csqa_test_data_promptv2_1241.json b/csqa_test_data_promptv2_1241.json
diff --git a/examples/benchmarking.py b/examples/benchmarking.py
@@ -53,19 +53,76 @@
     "lmflow_chat_cn_dialog_multiturn_single_nll_text2text":"data/lmflow_chat_cn_dialog_multiturn_single_nll_text2text",
     "lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp":"data/lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp",
     "lmflow_chat_en_dialog_multiturn_single_nll_text2text":"data/lmflow_chat_en_dialog_multiturn_single_nll_text2text",
+    "arc_c_prompt_text2text":"data/commonsense_eval/arc_c_prompt_text2text",
+    "arc_c_prompt_text2text_":"data/commonsense_eval/arc_c_prompt_text2text_",
+    "arc_c_prompt_text_only":"data/commonsense_eval/arc_c_prompt_text_only",
+    "arc_e_prompt_text2text":"data/commonsense_eval/arc_e_prompt_text2text",
+    "arc_e_prompt_text2text_":"data/commonsense_eval/arc_e_prompt_text2text_",
+    "arc_e_prompt_text_only":"data/commonsense_eval/arc_e_prompt_text_only",
+    "piqa_prompt_text2text":"data/commonsense_eval/piqa_prompt_text2text",
+    "piqa_prompt_text2text_":"data/commonsense_eval/piqa_prompt_text2text_",
+    "piqa_prompt_text_only":"data/commonsense_eval/piqa_prompt_text_only",
+    "hellaswag_prompt_text2text":"data/commonsense_eval/hellaswag_prompt_text2text",
+    "hellaswag_prompt_text2text_":"data/commonsense_eval/hellaswag_prompt_text2text_",
+    "hellaswag_prompt_text_only":"data/commonsense_eval/hellaswag_prompt_text_only",
+    "boolq_prompt_text2text":"data/commonsense_eval/boolq_prompt_text2text",
+    "boolq_prompt_text2text_":"data/commonsense_eval/boolq_prompt_text2text_",
+    "boolq_prompt_text_only":"data/commonsense_eval/boolq_prompt_text_only",
+    "obqa_prompt_text2text":"data/commonsense_eval/obqa_prompt_text2text",
+    "obqa_prompt_text2text_":"data/commonsense_eval/obqa_prompt_text2text_",
+    "obqa_prompt_text_only":"data/commonsense_eval/obqa_prompt_text_only",
+    "winogrande_prompt_text2text":"data/commonsense_eval/winogrande_prompt_text2text",
+    "winogrande_prompt_text2text_":"data/commonsense_eval/winogrande_prompt_text2text_",
+    "winogrande_prompt_text_only":"data/commonsense_eval/winogrande_prompt_text_only",
+    "lmflow_chat_en_half_multi":"data/commonsense_eval/lmflow_chat_half_multi",
+    "lmflow_chat_en_half_single":"data/commonsense_eval/lmflow_chat_half_single",
+    "lmflow_chat_en_sample20_multi":"data/commonsense_eval/lmflow_chat_sample20_multi",
+    "lmflow_chat_en_sample20_single":"data/commonsense_eval/lmflow_chat_sample20_single",
+    "lmflow_chat_en_sample10_multi":"data/commonsense_eval/lmflow_chat_sample10_multi",
+    "lmflow_chat_en_sample10_single":"data/commonsense_eval/lmflow_chat_sample10_single",
+    'mmlu_text2text':'data/commonsense_eval/mmlu_text2text',
+    'mmlu_textonly':'data/commonsense_eval/mmlu_textonly',
+    'crows_pairs_textonly':'data/commonsense_eval/crows_pairs_textonly',
+    "headqa_prompt_text2text":"data/commonsense_eval/headqa_prompt_text2text",
+    "logiqa_prompt_text2text":"data/commonsense_eval/logiqa_prompt_text2text",
+    "race_prompt_text2text":"data/commonsense_eval/race_prompt_text2text",
+    "sciq_prompt_text2text":"data/commonsense_eval/sciq_prompt_text2text",
+    "webqs_prompt_text2text":"data/commonsense_eval/webqs_prompt_text2text",
+    "coqa_prompt_text2text":"data/commonsense_eval/coqa_prompt_text2text",
+    "drop_prompt_text2text":"data/commonsense_eval/drop_prompt_text2text",
+    "mctaco_prompt_text2text":"data/commonsense_eval/mctaco_prompt_text2text",
 }
 
 LM_EVAL_DATASET_MAP={
     "commonsense_qa_eval":"openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq",
+    "commonsense_qa_eval_5":"openbookqa,arc_easy,arc_challenge,piqa,boolq",
     "math_eval":"gsm8k",
     'boolq':"boolq",
+    'mmlu':"helm_hendrycksTest-*",
+    'truthfulqa':"truthfulqa_mc",
+    'crows_pairs':"crows_pairs_english",
+    'mmlu_crows':"helm_hendrycksTest-*,crows_pairs_english",
+    "add_cs":"sciq,race,webqs,logiqa,headqa",
+    "emnlp_cs":"drop,mc_taco,coqa",
+    "mmlu-anatomy":"hendrycksTest-anatomy"
 }
 
 
 LOCAL_DATSET_GROUP_MAP={
     "commonsense_nll_eval":"common_sense_eval_arc_c,common_sense_eval_arc_e,common_sense_eval_winogrande,\
     common_sense_eval_obqa,common_sense_eval_piqa,common_sense_eval_hellaswag,common_sense_eval_siqa,\
     common_sense_eval_boolq",
+    "common_sense_prompt":"arc_c_prompt_text2text,arc_c_prompt_text_only,arc_e_prompt_text2text,arc_e_prompt_text_only,piqa_prompt_text2text,piqa_prompt_text_only",
+    "common_sense_eval_arc_c":"common_sense_eval_arc_c,common_sense_eval_arc_e",
+    "commonsense_fix":"arc_c_prompt_text2text,arc_c_prompt_text_only,arc_e_prompt_text2text,arc_e_prompt_text_only,piqa_prompt_text2text,piqa_prompt_text_only,\
+    hellaswag_prompt_text2text,hellaswag_prompt_text_only,boolq_prompt_text2text,boolq_prompt_text_only,obqa_prompt_text2text,obqa_prompt_text_only,\
+    winogrande_prompt_text2text,winogrande_prompt_text_only",
+    "commonsense_fix_abcd_prompt":"arc_c_prompt_text2text,arc_e_prompt_text2text,piqa_prompt_text2text,\
+    hellaswag_prompt_text2text,boolq_prompt_text2text,obqa_prompt_text2text,\
+    winogrande_prompt_text2text",
+    "commonsense_fix_abcd_prompt_complete":"boolq_prompt_text2text_,obqa_prompt_text2text_,\
+    winogrande_prompt_text2text_",
+    "commonsense_fix_abcd_prompt_complete_2":"piqa_prompt_text2text_,hellaswag_prompt_text2text_",
     "gpt4_en_eval":"gpt4_en_eval",
     "gpt4_zh_eval":"gpt4_zh_eval",
     "wiki_zh_eval":"wiki_zh_eval",
@@ -77,9 +134,29 @@
     common_sense_eval_boolq,gpt4_en_eval,gpt4_zh_eval,wiki_zh_eval,wiki_en_eval,\
     lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_cn_dialog_multiturn_single_nll_text2text,\
     lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_en_dialog_multiturn_single_nll_text2text",
+    "nocs_nll_eval":"gpt4_en_eval,gpt4_zh_eval,wiki_zh_eval,wiki_en_eval,\
+    lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_cn_dialog_multiturn_single_nll_text2text,\
+    lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_en_dialog_multiturn_single_nll_text2text",
     "lmflow_chat_nll_eval":"lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_cn_dialog_multiturn_single_nll_text2text,\
     lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_en_dialog_multiturn_single_nll_text2text",
     "lmflow_chat_zh_nll_eval":"lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_cn_dialog_multiturn_single_nll_text2text",
+    "all_cs_nll_eval":"common_sense_eval_arc_c,common_sense_eval_arc_e,common_sense_eval_winogrande,\
+    common_sense_eval_obqa,common_sense_eval_piqa,common_sense_eval_hellaswag,common_sense_eval_siqa,common_sense_eval_boolq",\
+    "lmflow_chat_nll_eval_sample":"lmflow_chat_en_half_multi,lmflow_chat_en_half_single,lmflow_chat_en_sample20_multi,\
+    lmflow_chat_en_sample20_single,lmflow_chat_en_sample10_multi,lmflow_chat_en_sample10_single",\
+    "all_nll_eval":"common_sense_eval_arc_c,common_sense_eval_arc_e,common_sense_eval_winogrande,\
+    common_sense_eval_obqa,common_sense_eval_piqa,common_sense_eval_hellaswag,\
+    common_sense_eval_boolq,gpt4_en_eval,wiki_en_eval,\
+    lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_en_dialog_multiturn_single_nll_text2text",\
+    "test_nll_gpt4_wiki":"gpt4_en_eval,wiki_en_eval",
+    "mmlu_crows_nll":"mmlu_text2text,mmlu_textonly,crows_pairs_textonly",
+    "add_cs_nll":"headqa_prompt_text2text,logiqa_prompt_text2text,race_prompt_text2text,sciq_prompt_text2text,webqs_prompt_text2text",
+    "emnlp_cs_nll":"coqa_prompt_text2text,drop_prompt_text2text,mctaco_prompt_text2text",
+    "all_nll_eval_text2text":"arc_c_prompt_text2text,arc_e_prompt_text2text,winogrande_prompt_text2text,\
+    obqa_prompt_text2text,piqa_prompt_text2text,hellaswag_prompt_text2text,\
+    boolq_prompt_text2text,gpt4_en_eval,wiki_en_eval,\
+    lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_en_dialog_multiturn_single_nll_text2text",\
+    "lmflow_inst_dialog":"lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_en_dialog_multiturn_single_nll_text2text,gpt4_en_eval",
 }
 
 LOCAL_DATSET_ANSWERTYPE_MAP={
@@ -104,6 +181,50 @@
     "lmflow_chat_cn_dialog_multiturn_single_nll_text2text":"text2text",
     "lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp":"text2text",
     "lmflow_chat_en_dialog_multiturn_single_nll_text2text":"text2text",
+    "arc_c_prompt_text2text":"text2text",
+    "arc_c_prompt_text_only":"text_only",
+    "arc_e_prompt_text2text":"text2text",
+    "arc_e_prompt_text_only":"text_only",
+    "piqa_prompt_text2text":"text2text",
+    "piqa_prompt_text_only":"text_only",
+    "hellaswag_prompt_text2text":"text2text",
+    "hellaswag_prompt_text_only":"text_only",
+    "boolq_prompt_text2text":"text2text",
+    "boolq_prompt_text_only":"text_only",
+    "obqa_prompt_text2text":"text2text",
+    "obqa_prompt_text_only":"text_only",
+    "arc_c_prompt_text2text_":"text2text",
+    "arc_e_prompt_text2text_":"text2text",
+    "piqa_prompt_text2text_":"text2text",
+    "hellaswag_prompt_text2text_":"text2text",
+    "boolq_prompt_text2text_":"text2text",
+    "obqa_prompt_text2text_":"text2text",
+    "winogrande_prompt_text2text_":"text2text",
+    "winogrande_prompt_text2text":"text2text",
+    "lmflow_chat_en_half_multi":"text2text",
+    "lmflow_chat_en_half_single":"text2text",
+    "lmflow_chat_en_sample20_multi":"text2text",
+    "lmflow_chat_en_sample20_single":"text2text",
+    "lmflow_chat_en_sample10_multi":"text2text",
+    "lmflow_chat_en_sample10_single":"text2text",
+    'mmlu_text2text':'text2text',
+    'mmlu_textonly':'text_only',
+    'crows_pairs_textonly':'text_only',
+    "headqa_prompt_text2text":"text2text",
+    "logiqa_prompt_text2text":"text2text",
+    "race_prompt_text2text":"text2text",
+    "sciq_prompt_text2text":"text2text",
+    "webqs_prompt_text2text":"text2text",
+    "coqa_prompt_text2text":"text2text",
+    "drop_prompt_text2text":"text2text",
+    "mctaco_prompt_text2text":"text2text",
+}
+
+LM_EVAL_FEW_SHOT={
+    "arc_easy":25,
+    "winogrande":5,
+    "hellaswag":10,
+    "arc_challenge":25,
 }
 
 
@@ -170,7 +291,8 @@ def run_lmflow_local_benchmarking(dataset_name,pipeline_name,model_args, \
         # 1. When --metric is not specified, or "accuracy", log warning and change to
         #    the dataset_name's default metric
         # 2. If specified, use the specified metric
-        result = evaluator.evaluate(model=model, dataset=dataset, metric=local_metric,verbose=True)
+        # result = evaluator.evaluate(model=model, dataset=dataset, metric=local_metric,verbose=True,entropy_n=pipeline_args.entropy_n)
+        result = evaluator.evaluate(model=model, dataset=dataset, metric=pipeline_args.metric,verbose=True)
         reuslt_collection.append({"dataset":dataset_name_,"result":result})
     for record in reuslt_collection:
         print("-"*30)
@@ -187,10 +309,25 @@ def run_lm_evaluation_benchmarking(dataset_name,model_name):
     # "--tasks", "openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq"
     # "--device", "cuda:0"])
     dataset = LM_EVAL_DATASET_MAP[dataset_name]
-    subprocess.run(["python3", "utils/lm_evaluator.py", "--model", "hf-causal-experimental", 
-    "--model_args", "pretrained="+model_name,
-    "--tasks", dataset,
-    "--device", "cuda:0"])
+    num_fewshot = 0
+    new_tasks = []
+    no_few_tasks = []
+    for task in dataset.split(','):
+        if task in LM_EVAL_FEW_SHOT.keys():
+            new_tasks.append(task)
+        else:
+            no_few_tasks.append(task)
+    new_tasks.append(','.join(no_few_tasks))
+    for dataset in new_tasks:
+        if dataset in LM_EVAL_FEW_SHOT.keys():
+            num_fewshot = LM_EVAL_FEW_SHOT[dataset]
+        else:
+            num_fewshot = 0
+        subprocess.run(["python3", "utils/lm_evaluator.py", "--model", "hf-causal-experimental", 
+        "--model_args", "trust_remote_code=True,pretrained="+model_name,
+        "--tasks", dataset,
+        "--num_fewshot",str(num_fewshot),
+        "--device", "cuda:0"])
 
 def main():
     # Parses arguments (self-defined for our evaluation platform)
@@ -214,7 +351,7 @@ def main():
     dataset_name = benchmarking_args.dataset_name
     # metric = pipeline_args.metric
     if is_lmflow_local_benchmarking(dataset_name):   # TODO (@Jipeng)
-        model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config)
+        model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config, use_accelerator=pipeline_args.use_accelerator_for_evaluator)
         run_lmflow_local_benchmarking(dataset_name,pipeline_name,model_args,pipeline_args,model)  # Pass args TODO (@Jipeng)
     elif is_lm_evaluation_benchmarking(dataset_name):
         model = model_args.model_name_or_path

diff --git a/examples/generate_text.py b/examples/generate_text.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved.
+"""Generate text for a dataset with the LMFlow "inferencer" pipeline.
+
+Parses model, dataset and inferencer arguments from the command line,
+loads the DeepSpeed config file named by the `deepspeed` pipeline argument,
+builds the model and the dataset, runs inference over the dataset with
+max_new_tokens=512 and temperature=0.7, and dumps the second value returned
+by `inferencer.inference` as JSON to `<output_result_path>/results.json`.
+
+Typical usage example (other model/dataset arguments omitted):
+
+  python examples/generate_text.py --deepspeed <ds_config.json> --output_result_path <dir>
+"""
+import json
+import os
+import sys
+sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0])))
+from transformers import HfArgumentParser
+
+from lmflow.datasets.dataset import Dataset
+from lmflow.pipeline.auto_pipeline import AutoPipeline
+from lmflow.models.auto_model import AutoModel
+from lmflow.args import ModelArguments, DatasetArguments, AutoArguments
+
+def write_json(data,path):
+    with open(path, 'w') as f:
+        json.dump(data, f, indent=4)
+
+pipeline_name = "inferencer"
+PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name)
+
+parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments))
+model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses()
+
+with open (pipeline_args.deepspeed, "r") as f:
+    ds_config = json.load(f)
+
+model = AutoModel.get_model(
+    model_args, 
+    tune_strategy='none', 
+    ds_config=ds_config, 
+    use_accelerator=pipeline_args.use_accelerator_for_evaluator
+)
+dataset = Dataset(data_args)
+
+inferencer = AutoPipeline.get_pipeline(
+    pipeline_name=pipeline_name,
+    model_args=model_args,
+    data_args=data_args,
+    pipeline_args=pipeline_args,
+)
+output_datasets, output_file = inferencer.inference(model=model, dataset=dataset, max_new_tokens=512, temperature=0.7)
+write_json(output_file, pipeline_args.output_result_path+'/results.json')
diff --git a/lm-evaluation-harness b/lm-evaluation-harness