diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index aebf6d33..7b980508 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -36,6 +36,8 @@ jobs: - name: Test env: HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }} + HF_HOME: "cache/models" + HF_DATASETS_CACHE: "cache/datasets" run: | # PYTHONPATH="${PYTHONPATH}:src" HF_DATASETS_CACHE="cache/datasets" HF_HOME="cache/models" python -m pytest --disable-pytest-warnings - name: Write cache diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/adding-a-custom-task.mdx index 752c4e54..2fbff552 100644 --- a/docs/source/adding-a-custom-task.mdx +++ b/docs/source/adding-a-custom-task.mdx @@ -191,8 +191,7 @@ Once your file is created you can then run the evaluation with the following com ```bash lighteval accelerate \ - --model_args "pretrained=HuggingFaceH4/zephyr-7b-beta" \ - --tasks "community|{custom_task}|{fewshots}|{truncate_few_shot}" \ - --custom_tasks {path_to_your_custom_task_file} \ - --output_dir "./evals" + "pretrained=HuggingFaceH4/zephyr-7b-beta" \ + "community|{custom_task}|{fewshots}|{truncate_few_shot}" \ + --custom-tasks {path_to_your_custom_task_file} ``` diff --git a/docs/source/available-tasks.mdx b/docs/source/available-tasks.mdx index 9b167d21..f340fabb 100644 --- a/docs/source/available-tasks.mdx +++ b/docs/source/available-tasks.mdx @@ -3,7 +3,13 @@ You can get a list of all the available tasks by running: ```bash -lighteval tasks --list +lighteval tasks list +``` + +You can also inspect a specific task by running: + +```bash +lighteval tasks inspect ``` ## List of tasks diff --git a/docs/source/evaluate-the-model-on-a-server-or-container.mdx b/docs/source/evaluate-the-model-on-a-server-or-container.mdx index da8f1d4b..0d9a7d12 100644 --- a/docs/source/evaluate-the-model-on-a-server-or-container.mdx +++ b/docs/source/evaluate-the-model-on-a-server-or-container.mdx @@ -6,10 +6,9 @@ to the server. The command is the same as before, except you specify a path to a yaml config file (detailed below): ```bash -lighteval accelerate \ - --model_config_path="/path/to/config/file"\ - --tasks \ - --output_dir output_dir +lighteval endpoint {tgi,inference-endpoint} \ + "/path/to/config/file"\ + ``` There are two types of configuration files that can be provided for running on @@ -65,3 +64,19 @@ model: inference_server_auth: null model_id: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory ``` + +### OpenAI API + +Lighteval also supports evaluating models on the OpenAI API. To do so you need to set your OpenAI API key in the environment variable. + +```bash +export OPENAI_API_KEY={your_key} +``` + +And then run the following command: + +```bash +lighteval endpoint openai \ + {model-name} \ + +``` diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 9c055f5e..fa1895b7 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -5,7 +5,7 @@ backends—whether it's [transformers](https://github.com/huggingface/transformers), [tgi](https://github.com/huggingface/text-generation-inference), [vllm](https://github.com/vllm-project/vllm), or -[nanotron](https://github.com/huggingface/nanotron)—with +[nanotron](https://github.com/huggingface/nanotron)-with ease. Dive deep into your model’s performance by saving and exploring detailed, sample-by-sample results to debug and see how your models stack-up. 
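The intro above points at the saved, sample-by-sample details; as a hedged illustration of how those can be explored, the sketch below reads one details file back with the `datasets` library. The path is hypothetical — real files follow the `{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet` pattern described in the logging docs further down in this diff.

```python
# Minimal sketch (hypothetical path): read a lighteval details parquet file and print a few rows.
from datasets import load_dataset

details_file = (
    "results/details/gpt2/2024-06-01T00-00-00/"
    "details_leaderboard_truthfulqa_2024-06-01T00-00-00.parquet"  # hypothetical file name
)
details = load_dataset("parquet", data_files=details_file, split="train")

# Each row holds one per-sample record (prompt, model prediction, gold, metrics —
# exact columns depend on the task), which is what "sample-by-sample results" refers to above.
for row in details.select(range(min(3, len(details)))):
    print(row)
```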
diff --git a/docs/source/package_reference/model_config.mdx b/docs/source/package_reference/model_config.mdx index c70258bb..e2ecceb4 100644 --- a/docs/source/package_reference/model_config.mdx +++ b/docs/source/package_reference/model_config.mdx @@ -8,5 +8,3 @@ [[autodoc]] models.model_config.InferenceModelConfig [[autodoc]] models.model_config.TGIModelConfig [[autodoc]] models.model_config.VLLMModelConfig - -[[autodoc]] models.model_config.create_model_config diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx index 5f66547e..b2190245 100644 --- a/docs/source/quicktour.mdx +++ b/docs/source/quicktour.mdx @@ -1,11 +1,24 @@ # Quicktour -We provide two main entry points to evaluate models: + +> [!TIP] +> We recommend using the `--help` flag to get more information about the +> available options for each command. +> `lighteval --help` + +Lighteval can be used with a few different commands. - `lighteval accelerate` : evaluate models on CPU or one or more GPUs using [🤗 Accelerate](https://github.com/huggingface/accelerate) - `lighteval nanotron`: evaluate models in distributed settings using [⚡️ Nanotron](https://github.com/huggingface/nanotron) +- `lighteval vllm`: evaluate models on one or more GPUs using [🚀 + VLLM](https://github.com/vllm-project/vllm) +- `lighteval endpoint` + - `inference-endpoint`: evaluate models on one or more GPUs using [🔗 + Inference Endpoint](https://huggingface.co/inference-endpoints/dedicated) + - `tgi`: evaluate models on one or more GPUs using [🔗 Text Generation Inference](https://huggingface.co/docs/text-generation-inference/en/index) + - `openai`: evaluate models on one or more GPUs using [🔗 OpenAI API](https://platform.openai.com/) ## Accelerate @@ -15,10 +28,8 @@ To evaluate `GPT-2` on the Truthful QA benchmark, run: ```bash lighteval accelerate \ - --model_args "pretrained=gpt2" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --override_batch_size 1 \ - --output_dir="./evals/" + "pretrained=gpt2" \ + "leaderboard|truthfulqa:mc|0|0" ``` Here, `--tasks` refers to either a comma-separated list of supported tasks from @@ -51,10 +62,8 @@ You can then evaluate a model using data parallelism on 8 GPUs like follows: ```bash accelerate launch --multi_gpu --num_processes=8 -m \ lighteval accelerate \ - --model_args "pretrained=gpt2" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --override_batch_size 1 \ - --output_dir="./evals/" + "pretrained=gpt2" \ + "leaderboard|truthfulqa:mc|0|0" ``` Here, `--override_batch_size` defines the batch size per device, so the effective @@ -66,10 +75,8 @@ To evaluate a model using pipeline parallelism on 2 or more GPUs, run: ```bash lighteval accelerate \ - --model_args "pretrained=gpt2,model_parallel=True" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --override_batch_size 1 \ - --output_dir="./evals/" + "pretrained=gpt2,model_parallel=True" \ + "leaderboard|truthfulqa:mc|0|0" ``` This will automatically use accelerate to distribute the model across the GPUs. @@ -81,7 +88,7 @@ GPUs. ### Model Arguments -The `--model_args` argument takes a string representing a list of model +The `model-args` argument takes a string representing a list of model argument. The arguments allowed vary depending on the backend you use (vllm or accelerate). @@ -150,8 +157,8 @@ To evaluate a model trained with nanotron on a single gpu. 
```bash torchrun --standalone --nnodes=1 --nproc-per-node=1 \ src/lighteval/__main__.py nanotron \ - --checkpoint_config_path ../nanotron/checkpoints/10/config.yaml \ - --lighteval_config_path examples/nanotron/lighteval_config_override_template.yaml + --checkpoint-config-path ../nanotron/checkpoints/10/config.yaml \ + --lighteval-config-path examples/nanotron/lighteval_config_override_template.yaml ``` The `nproc-per-node` argument should match the data, tensor and pipeline diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/saving-and-reading-results.mdx index b50cdee6..8c347cee 100644 --- a/docs/source/saving-and-reading-results.mdx +++ b/docs/source/saving-and-reading-results.mdx @@ -3,30 +3,32 @@ ## Saving results locally Lighteval will automatically save results and evaluation details in the -directory set with the `--output_dir` argument. The results will be saved in +directory set with the `--output-dir` option. The results will be saved in `{output_dir}/results/{model_name}/results_{timestamp}.json`. [Here is an example of a result file](#example-of-a-result-file). The output path can be any [fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html) compliant path (local, s3, hf hub, gdrive, ftp, etc). -To save the details of the evaluation, you can use the `--save_details` -argument. The details will be saved in a parquet file +To save the details of the evaluation, you can use the `--save-details` +option. The details will be saved in a parquet file `{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet`. ## Pushing results to the HuggingFace hub You can push the results and evaluation details to the HuggingFace hub. To do -so, you need to set the `--push_to_hub` as well as the `--results_org` -argument. The results will be saved in a dataset with the name at +so, you need to set the `--push-to-hub` as well as the `--results-org` +option. The results will be saved in a dataset with the name at `{results_org}/{model_org}/{model_name}`. To push the details, you need to set -the `--save_details` argument. +the `--save-details` option. The dataset created will be private by default, you can make it public by -setting the `--public_run` argument. +setting the `--public-run` option. ## Pushing results to Tensorboard -You can push the results to Tensorboard by setting `--push_to_tensorboard`. +You can push the results to Tensorboard by setting `--push-to-tensorboard`. +This will create a Tensorboard dashboard in a HF org set with the `--results-org` +option. ## How to load and investigate details diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/use-vllm-as-backend.mdx index 153ff659..787848c3 100644 --- a/docs/source/use-vllm-as-backend.mdx +++ b/docs/source/use-vllm-as-backend.mdx @@ -4,10 +4,9 @@ Lighteval allows you to use `vllm` as backend allowing great speedups. To use, simply change the `model_args` to reflect the arguments you want to pass to vllm. ```bash -lighteval accelerate \ - --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --output_dir="./evals/" +lighteval vllm \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \ + "leaderboard|truthfulqa:mc|0|0" ``` `vllm` is able to distribute the model across multiple GPUs using data @@ -17,19 +16,17 @@ You can choose the parallelism method by setting in the the `model_args`. 
For example if you have 4 GPUs you can split it across using `tensor_parallelism`: ```bash -export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval accelerate \ - --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --output_dir="./evals/" +export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval vllm \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4" \ + "leaderboard|truthfulqa:mc|0|0" ``` Or, if your model fits on a single GPU, you can use `data_parallelism` to speed up the evaluation: ```bash -lighteval accelerate \ - --model_args="vllm,pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4" \ - --tasks "leaderboard|truthfulqa:mc|0|0" \ - --output_dir="./evals/" +lighteval vllm \ + "pretrained=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4" \ + "leaderboard|truthfulqa:mc|0|0" ``` Available arguments for `vllm` can be found in the `VLLMModelConfig`: @@ -50,4 +47,3 @@ Available arguments for `vllm` can be found in the `VLLMModelConfig`: > [!WARNING] > In the case of OOM issues, you might need to reduce the context size of the > model as well as reduce the `gpu_memory_utilisation` parameter. - diff --git a/examples/model_configs/base_model.yaml b/examples/model_configs/base_model.yaml index 925d3e50..d6563e61 100644 --- a/examples/model_configs/base_model.yaml +++ b/examples/model_configs/base_model.yaml @@ -1,5 +1,4 @@ model: - type: "base" # can be base, tgi, or endpoint base_params: model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... dtype: "bfloat16" diff --git a/examples/model_configs/endpoint_model.yaml b/examples/model_configs/endpoint_model.yaml index 4bf2f060..c3f5222b 100644 --- a/examples/model_configs/endpoint_model.yaml +++ b/examples/model_configs/endpoint_model.yaml @@ -1,5 +1,4 @@ model: - type: "endpoint" # can be base, tgi, or endpoint base_params: endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters model: "meta-llama/Llama-2-7b-hf" diff --git a/examples/model_configs/peft_model.yaml b/examples/model_configs/peft_model.yaml index d94ff610..81205818 100644 --- a/examples/model_configs/peft_model.yaml +++ b/examples/model_configs/peft_model.yaml @@ -1,5 +1,4 @@ model: - type: "base" base_params: model_args: "pretrained=predibase/customer_support,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... For a PEFT model, the pretrained model should be the one trained with PEFT and the base model below will contain the original model on which the adapters will be applied. dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. diff --git a/examples/model_configs/quantized_model.yaml b/examples/model_configs/quantized_model.yaml index dfac1c95..3bc6b2c3 100644 --- a/examples/model_configs/quantized_model.yaml +++ b/examples/model_configs/quantized_model.yaml @@ -1,5 +1,4 @@ model: - type: "base" base_params: model_args: "pretrained=HuggingFaceH4/zephyr-7b-beta,revision=main" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. 
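The `dtype: "4bit"` / `"8bit"` comments in the YAML examples above map onto a `BitsAndBytesConfig` in the new `accelerate` entry point further down in this diff. Here is a minimal standalone sketch of that mapping, where the `config` dict stands in for the parsed `model` section of one of these YAML files:

```python
# Sketch: how the YAML dtype field is turned into a quantization config,
# mirroring the logic in src/lighteval/main_accelerate.py shown later in this diff.
import torch
from transformers import BitsAndBytesConfig

config = {"base_params": {"dtype": "4bit"}}  # stand-in for yaml.safe_load(f)["model"]

dtype = config["base_params"]["dtype"]
if dtype == "4bit":
    # 4-bit weights with float16 compute
    quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
elif dtype == "8bit":
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
else:
    quantization_config = None  # any other dtype is used as-is, without quantization

print(quantization_config)
```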
diff --git a/examples/model_configs/tgi_model.yaml b/examples/model_configs/tgi_model.yaml index 82ac50a7..8db5654d 100644 --- a/examples/model_configs/tgi_model.yaml +++ b/examples/model_configs/tgi_model.yaml @@ -1,5 +1,4 @@ model: - type: "tgi" # can be base, tgi, or endpoint instance: inference_server_address: "" inference_server_auth: null diff --git a/examples/nanotron/lighteval_config_override_template.yaml b/examples/nanotron/lighteval_config_override_template.yaml index 03b65596..50886ced 100644 --- a/examples/nanotron/lighteval_config_override_template.yaml +++ b/examples/nanotron/lighteval_config_override_template.yaml @@ -4,9 +4,7 @@ generation: null logging: output_dir: "outputs" save_details: false - push_results_to_hub: false - push_details_to_hub: false - push_results_to_tensorboard: false + push_to_hub: false public_run: false results_org: null tensorboard_metric_prefix: "eval" diff --git a/pyproject.toml b/pyproject.toml index 7d17c8d1..1a99b6a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,6 +61,7 @@ dependencies = [ "datasets>=2.14.0", "numpy<2", # pinned to avoid incompatibilities # Prettiness + "typer", "termcolor==2.3.0", "pytablewriter", "colorama", @@ -114,4 +115,4 @@ Issues = "https://github.com/huggingface/lighteval/issues" # Changelog = "https://github.com/huggingface/lighteval/blob/master/CHANGELOG.md" [project.scripts] -lighteval = "lighteval.__main__:cli_evaluate" +lighteval = "lighteval.__main__:app" diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py index b0164b2d..c715723d 100644 --- a/src/lighteval/__main__.py +++ b/src/lighteval/__main__.py @@ -1,5 +1,3 @@ -#!/usr/bin/env python - # MIT License # Copyright (c) 2024 Taratra D. RAHARISON and The HuggingFace Team @@ -22,81 +20,36 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import typer -import argparse -import os -from dataclasses import asdict -from pprint import pformat - -from lighteval.parsers import parser_accelerate, parser_baseline, parser_nanotron, parser_utils_tasks -from lighteval.tasks.registry import Registry, taskinfo_selector - - -CACHE_DIR = os.getenv("HF_HOME") - - -def cli_evaluate(): # noqa: C901 - parser = argparse.ArgumentParser(description="CLI tool for lighteval, a lightweight framework for LLM evaluation") - subparsers = parser.add_subparsers(help="help for subcommand", dest="subcommand") - - # Subparser for the "accelerate" command - parser_a = subparsers.add_parser("accelerate", help="use accelerate and transformers as backend for evaluation.") - parser_accelerate(parser_a) - - # Subparser for the "nanotron" command - parser_b = subparsers.add_parser("nanotron", help="use nanotron as backend for evaluation.") - parser_nanotron(parser_b) - - parser_c = subparsers.add_parser("baseline", help="compute baseline for a task") - parser_baseline(parser_c) - - # Subparser for task utils functions - parser_d = subparsers.add_parser("tasks", help="display information about available tasks and samples.") - parser_utils_tasks(parser_d) - - args = parser.parse_args() - - if args.subcommand == "accelerate": - from lighteval.main_accelerate import main as main_accelerate - - main_accelerate(args) - - elif args.subcommand == "nanotron": - from lighteval.main_nanotron import main as main_nanotron - - main_nanotron(args.checkpoint_config_path, args.lighteval_config_path, args.cache_dir) +import lighteval.main_accelerate +import lighteval.main_baseline +import lighteval.main_endpoint +import lighteval.main_nanotron +import lighteval.main_tasks +import lighteval.main_vllm - elif args.subcommand == "baseline": - from lighteval.main_baseline import main as main_baseline - main_baseline(args) +app = typer.Typer() - elif args.subcommand == "tasks": - registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks) - if args.list: - registry.print_all_tasks() - if args.inspect: - print(f"Loading the tasks dataset to cache folder: {args.cache_dir}") - print( - "All examples will be displayed without few shot, as few shot sample construction requires loading a model and using its tokenizer. " - ) - # Loading task - task_names_list, _ = taskinfo_selector(args.inspect, task_registry=registry) - task_dict = registry.get_task_dict(task_names_list) - for name, task in task_dict.items(): - print("-" * 10, name, "-" * 10) - if args.show_config: - print("-" * 10, "CONFIG") - task.cfg.print() - for ix, sample in enumerate(task.eval_docs()[: int(args.num_samples)]): - if ix == 0: - print("-" * 10, "SAMPLES") - print(f"-- sample {ix} --") - print(pformat(asdict(sample), indent=1)) - else: - print("You did not provide any argument. 
Exiting") +app.command(rich_help_panel="Evaluation Backends")(lighteval.main_accelerate.accelerate) +app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline) +app.command(rich_help_panel="Evaluation Backends")(lighteval.main_nanotron.nanotron) +app.command(rich_help_panel="Evaluation Backends")(lighteval.main_vllm.vllm) +app.add_typer( + lighteval.main_endpoint.app, + name="endpoint", + rich_help_panel="Evaluation Backends", + help="Evaluate models using some endpoint (tgi, inference endpoint, openai) as backend.", +) +app.add_typer( + lighteval.main_tasks.app, + name="tasks", + rich_help_panel="Utils", + help="List or inspect tasks.", +) if __name__ == "__main__": - cli_evaluate() + app() diff --git a/src/lighteval/logging/hierarchical_logger.py b/src/lighteval/logging/hierarchical_logger.py index 1c4c3a11..ac8d59d8 100644 --- a/src/lighteval/logging/hierarchical_logger.py +++ b/src/lighteval/logging/hierarchical_logger.py @@ -26,24 +26,10 @@ from logging import Logger from typing import Any, Callable -from lighteval.utils.imports import is_accelerate_available, is_nanotron_available - - -if is_nanotron_available(): - from nanotron.logging import get_logger - - logger = get_logger(__name__, log_level="INFO") -elif is_accelerate_available(): - from accelerate import Accelerator, InitProcessGroupKwargs - from accelerate.logging import get_logger +from colorama import Fore, Style - # We must init the accelerator before using the logger - accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) - logger = get_logger(__name__, log_level="INFO") -else: - logger = Logger(__name__, level="INFO") -from colorama import Fore, Style +logger = Logger(__name__, level="INFO") class HierarchicalLogger: diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index 67396be8..f6ed6b38 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -20,63 +20,173 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging import os -from datetime import timedelta +from typing import Optional -from lighteval.logging.evaluation_tracker import EvaluationTracker -from lighteval.logging.hierarchical_logger import hlog_warn, htrack -from lighteval.models.model_config import create_model_config -from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters -from lighteval.utils.imports import is_accelerate_available, is_tgi_available +from typer import Argument, Option +from typing_extensions import Annotated -if not is_accelerate_available() and not is_tgi_available(): - hlog_warn("Using either accelerate or text-generation to run this script is advised.") +logger = logging.getLogger(__name__) TOKEN = os.getenv("HF_TOKEN") - -if is_accelerate_available(): +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") + +HELP_PANNEL_NAME_1 = "Common Paramaters" +HELP_PANNEL_NAME_2 = "Logging Parameters" +HELP_PANNEL_NAME_3 = "Debug Paramaters" +HELP_PANNEL_NAME_4 = "Modeling Paramaters" + + +def accelerate( # noqa C901 + # === general === + model_args: Annotated[ + str, + Argument( + help="Model arguments in the form key1=value1,key2=value2,... 
or path to yaml config file (see examples/model_configs/base_model.yaml)" + ), + ], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + use_chat_template: Annotated[ + bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = False, + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + cache_dir: Annotated[ + Optional[str], Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, + override_batch_size: Annotated[ + int, Option(help="Override batch size for evaluation.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = -1, + job_id: Annotated[ + int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = 0, +): + """ + Evaluate models using accelerate and transformers as backend. 
+ """ + from datetime import timedelta + + import torch + import yaml from accelerate import Accelerator, InitProcessGroupKwargs - accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) -else: - accelerator = None + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_config import AdapterModelConfig, BaseModelConfig, BitsAndBytesConfig, DeltaModelConfig + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) + cache_dir = CACHE_DIR -@htrack() -def main(args): - env_config = EnvConfig(token=TOKEN, cache_dir=args.cache_dir) + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) evaluation_tracker = EvaluationTracker( - output_dir=args.output_dir, - save_details=args.save_details, - push_to_hub=args.push_to_hub, - push_to_tensorboard=args.push_to_tensorboard, - public=args.public_run, - hub_results_org=args.results_org, + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, ) pipeline_params = PipelineParameters( launcher_type=ParallelismManager.ACCELERATE, env_config=env_config, - job_id=args.job_id, - dataset_loading_processes=args.dataset_loading_processes, - custom_tasks_directory=args.custom_tasks, - override_batch_size=args.override_batch_size, - num_fewshot_seeds=args.num_fewshot_seeds, - max_samples=args.max_samples, - use_chat_template=args.use_chat_template, - system_prompt=args.system_prompt, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=override_batch_size, + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=use_chat_template, + system_prompt=system_prompt, ) - model_config = create_model_config( - use_chat_template=args.use_chat_template, - override_batch_size=args.override_batch_size, - model_args=args.model_args, - model_config_path=args.model_config_path, - accelerator=accelerator, - ) + # TODO (nathan): better handling of model_args + if model_args.endswith(".yaml"): + with open(model_args, "r") as f: + config = yaml.safe_load(f)["model"] + + # Creating optional quantization configuration + if config["base_params"]["dtype"] == "4bit": + quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) + elif config["base_params"]["dtype"] == "8bit": + quantization_config = BitsAndBytesConfig(load_in_8bit=True) + else: + quantization_config = None + + # We extract the model args + args_dict = {k.split("=")[0]: k.split("=")[1] for k in config["base_params"]["model_args"].split(",")} + + # We store the relevant other args + args_dict["base_model"] = config["merged_weights"]["base_model"] + args_dict["compile"] = bool(config["base_params"]["compile"]) + args_dict["dtype"] = config["base_params"]["dtype"] + args_dict["accelerator"] = accelerator + args_dict["quantization_config"] = quantization_config + args_dict["batch_size"] = override_batch_size + args_dict["multichoice_continuations_start_space"] = config["generation"][ + "multichoice_continuations_start_space" + ] + args_dict["use_chat_template"] = use_chat_template + + # Keeping only non null params + args_dict = {k: v for k, v in args_dict.items() if v is not None} + + if config["merged_weights"]["delta_weights"]: + if 
config["merged_weights"]["base_model"] is None: + raise ValueError("You need to specify a base model when using delta weights") + model_config = DeltaModelConfig(**args_dict) + elif config["merged_weights"]["adapter_weights"]: + if config["merged_weights"]["base_model"] is None: + raise ValueError("You need to specify a base model when using adapter weights") + model_config = AdapterModelConfig(**args_dict) + elif config["merged_weights"]["base_model"] not in ["", None]: + raise ValueError("You can't specify a base model if you are not using delta/adapter weights") + else: + model_config = BaseModelConfig(**args_dict) + else: + model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} + model_args_dict["accelerator"] = accelerator + model_args_dict["use_chat_template"] = use_chat_template + model_args_dict["compile"] = bool(model_args_dict["compile"]) if "compile" in model_args_dict else False + model_config = BaseModelConfig(**model_args_dict) pipeline = Pipeline( - tasks=args.tasks, + tasks=tasks, pipeline_parameters=pipeline_params, evaluation_tracker=evaluation_tracker, model_config=model_config, diff --git a/src/lighteval/main_baseline.py b/src/lighteval/main_baseline.py index f824d94f..dd478667 100644 --- a/src/lighteval/main_baseline.py +++ b/src/lighteval/main_baseline.py @@ -20,33 +20,66 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from lighteval.logging.evaluation_tracker import EvaluationTracker -from lighteval.metrics.utils.metric_utils import MetricCategory -from lighteval.models.abstract_model import ModelInfo -from lighteval.tasks.lighteval_task import LightevalTask -from lighteval.tasks.registry import Registry, taskinfo_selector -from lighteval.utils.utils import as_list - -def main(args): +import os +from typing import Optional + +from typer import Argument, Option +from typing_extensions import Annotated + + +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") + +HELP_PANNEL_NAME_1 = "Common Paramaters" +HELP_PANNEL_NAME_2 = "Logging Parameters" +HELP_PANNEL_NAME_3 = "Debug Paramaters" +HELP_PANNEL_NAME_4 = "Modeling Paramaters" + + +def baseline( + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = CACHE_DIR, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, +): """ Compute baselines for given tasks. It has been tested with generative and accuracy tasks, but may not work correctly for other task types. The baseline is computed as follows: + - For multiple-choice tasks: It assumes random guessing, so the score is n_correct/number_of_choices. - For other metrics: It assigns a score of 0, which may not be appropriate for all task types. Note: This baseline computation may not be suitable for all task types and should be used with caution. 
""" - task_registry = Registry(cache_dir=args.cache_dir, custom_tasks=args.custom_tasks) - task_names_list, fewshots_dict = taskinfo_selector(args.tasks, task_registry) + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.metrics.utils.metric_utils import MetricCategory + from lighteval.models.abstract_model import ModelInfo + from lighteval.tasks.lighteval_task import LightevalTask + from lighteval.tasks.registry import Registry, taskinfo_selector + from lighteval.utils.utils import as_list + + task_registry = Registry(cache_dir=cache_dir, custom_tasks=custom_tasks) + task_names_list, fewshots_dict = taskinfo_selector(tasks, task_registry) task_dict = task_registry.get_task_dict(task_names_list) evaluation_tracker = EvaluationTracker( - output_dir=args.output_dir, + output_dir=output_dir, save_details=False, push_to_hub=False, push_to_tensorboard=False, @@ -63,11 +96,11 @@ def main(args): ) evaluation_tracker.task_config_logger.log(task_dict) - LightevalTask.load_datasets(list(task_dict.values()), args.dataset_loading_processes) + LightevalTask.load_datasets(list(task_dict.values()), dataset_loading_processes) for task_name, task in task_dict.items(): task_docs = list(task.eval_docs()) - n_samples = min(args.max_samples, len(task_docs)) if args.max_samples else len(task_docs) + n_samples = min(max_samples, len(task_docs)) if max_samples else len(task_docs) p_correct_score = [ len(as_list(task_doc.gold_index)) / len(task_doc.choices) for task_doc in task_docs[:n_samples] diff --git a/src/lighteval/main_endpoint.py b/src/lighteval/main_endpoint.py new file mode 100644 index 00000000..877be2df --- /dev/null +++ b/src/lighteval/main_endpoint.py @@ -0,0 +1,397 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+import os +from typing import Optional + +import typer +from typer import Argument, Option +from typing_extensions import Annotated + + +app = typer.Typer() + + +TOKEN = os.getenv("HF_TOKEN") +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") + +HELP_PANNEL_NAME_1 = "Common Paramaters" +HELP_PANNEL_NAME_2 = "Logging Parameters" +HELP_PANNEL_NAME_3 = "Debug Paramaters" +HELP_PANNEL_NAME_4 = "Modeling Paramaters" + + +@app.command(rich_help_panel="Evaluation Backends") +def openai( + # === general === + model_name: Annotated[ + str, Argument(help="The model name to evaluate (has to be available through the openai API.") + ], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = CACHE_DIR, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, + job_id: Annotated[ + int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = 0, +): + """ + Evaluate OPENAI models. 
+ """ + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_config import OpenAIModelConfig + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, + ) + + parallelism_manager = ParallelismManager.OPENAI + model_config = OpenAIModelConfig(model=model_name) + + pipeline_params = PipelineParameters( + launcher_type=parallelism_manager, + env_config=env_config, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=-1, # Cannot override batch size when using OpenAI + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=False, # Cannot use chat template when using OpenAI + system_prompt=system_prompt, + ) + pipeline = Pipeline( + tasks=tasks, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config, + ) + + pipeline.evaluate() + + pipeline.show_results() + + results = pipeline.get_results() + + pipeline.save_and_push_results() + + return results + + +@app.command(rich_help_panel="Evaluation Backends") +def inference_endpoint( + # === general === + model_config_path: Annotated[ + str, Argument(help="Path to model config yaml file. (examples/model_configs/endpoint_model.yaml)") + ], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + use_chat_template: Annotated[ + bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = False, + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = CACHE_DIR, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to 
evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, + override_batch_size: Annotated[ + int, Option(help="Override batch size for evaluation.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = -1, + job_id: Annotated[ + int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = 0, +): + """ + Evaluate models using inference-endpoints as backend. + """ + import yaml + + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_config import ( + InferenceEndpointModelConfig, + InferenceModelConfig, + ) + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, + ) + + # TODO (nathan): better handling of model_args + + parallelism_manager = ParallelismManager.TGI + + with open(model_config_path, "r") as f: + config = yaml.safe_load(f)["model"] + + reuse_existing_endpoint = config["base_params"].get("reuse_existing", None) + + complete_config_endpoint = all( + val not in [None, ""] + for key, val in config.get("instance", {}).items() + if key not in InferenceEndpointModelConfig.nullable_keys() + ) + + if reuse_existing_endpoint or complete_config_endpoint: + model_config = InferenceEndpointModelConfig( + name=config["base_params"]["endpoint_name"].replace(".", "-").lower(), + repository=config["base_params"]["model"], + model_dtype=config["base_params"]["dtype"], + revision=config["base_params"]["revision"] or "main", + should_reuse_existing=reuse_existing_endpoint, + accelerator=config["instance"]["accelerator"], + region=config["instance"]["region"], + vendor=config["instance"]["vendor"], + instance_size=config["instance"]["instance_size"], + instance_type=config["instance"]["instance_type"], + namespace=config["instance"]["namespace"], + image_url=config["instance"].get("image_url", None), + env_vars=config["instance"].get("env_vars", None), + ) + else: + model_config = InferenceModelConfig(model=config["base_params"]["endpoint_name"]) + + pipeline_params = PipelineParameters( + launcher_type=parallelism_manager, + env_config=env_config, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=override_batch_size, + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=use_chat_template, + system_prompt=system_prompt, + ) + pipeline = Pipeline( + tasks=tasks, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config, + ) + + pipeline.evaluate() + + pipeline.show_results() + + results = pipeline.get_results() + + pipeline.save_and_push_results() + + return results + + +@app.command(rich_help_panel="Evaluation Backends") +def tgi( + # === general === + model_config_path: Annotated[ + str, Argument(help="Path to model config yaml file. 
(examples/model_configs/tgi_model.yaml)") + ], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + use_chat_template: Annotated[ + bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = False, + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = CACHE_DIR, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, + override_batch_size: Annotated[ + int, Option(help="Override batch size for evaluation.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = -1, + job_id: Annotated[ + int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = 0, +): + """ + Evaluate models using TGI as backend. 
+ """ + import yaml + + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_config import TGIModelConfig + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, + ) + + # TODO (nathan): better handling of model_args + parallelism_manager = ParallelismManager.TGI + with open(model_config_path, "r") as f: + config = yaml.safe_load(f)["model"] + + model_config = TGIModelConfig( + inference_server_address=config["instance"]["inference_server_address"], + inference_server_auth=config["instance"]["inference_server_auth"], + model_id=config["instance"]["model_id"], + ) + + pipeline_params = PipelineParameters( + launcher_type=parallelism_manager, + env_config=env_config, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=override_batch_size, + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=use_chat_template, + system_prompt=system_prompt, + ) + pipeline = Pipeline( + tasks=tasks, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config, + ) + + pipeline.evaluate() + + pipeline.show_results() + + results = pipeline.get_results() + + pipeline.save_and_push_results() + + return results diff --git a/src/lighteval/main_nanotron.py b/src/lighteval/main_nanotron.py index a00ef884..66826122 100644 --- a/src/lighteval/main_nanotron.py +++ b/src/lighteval/main_nanotron.py @@ -22,33 +22,46 @@ # flake8: noqa: C901 import os -from typing import Optional -from lighteval.config.lighteval_config import FullNanotronConfig, LightEvalConfig -from lighteval.logging.evaluation_tracker import EvaluationTracker -from lighteval.logging.hierarchical_logger import htrack, htrack_block -from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters -from lighteval.utils.imports import NO_NANOTRON_ERROR_MSG, is_nanotron_available -from lighteval.utils.utils import EnvConfig +from typer import Option +from typing_extensions import Annotated -if not is_nanotron_available(): - raise ImportError(NO_NANOTRON_ERROR_MSG) +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") -from nanotron.config import Config, get_config_from_file +HELP_PANNEL_NAME_1 = "Common Paramaters" +HELP_PANNEL_NAME_2 = "Logging Parameters" +HELP_PANNEL_NAME_3 = "Debug Paramaters" +HELP_PANNEL_NAME_4 = "Modeling Paramaters" SEED = 1234 -@htrack() -def main( - checkpoint_config_path: str, - lighteval_config_path: Optional[str] = None, - cache_dir: Optional[str] = os.getenv("HF_HOME", "/scratch"), +def nanotron( + checkpoint_config_path: Annotated[ + str, Option(help="Path to the nanotron checkpoint YAML or python config file, potentially on s3.") + ], + lighteval_config_path: Annotated[str, Option(help="Path to a YAML config to be used for the evaluation.")], + cache_dir: Annotated[str, Option(help="Cache directory for datasets and models.")] = CACHE_DIR, ): + """ + Evaluate models using nanotron as backend. 
+ """ + from nanotron.config import Config, get_config_from_file + + from lighteval.config.lighteval_config import FullNanotronConfig, LightEvalConfig + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.logging.hierarchical_logger import htrack_block + from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters + from lighteval.utils.imports import NO_NANOTRON_ERROR_MSG, is_nanotron_available + from lighteval.utils.utils import EnvConfig + env_config = EnvConfig(token=os.getenv("HF_TOKEN"), cache_dir=cache_dir) + if not is_nanotron_available(): + raise ImportError(NO_NANOTRON_ERROR_MSG) + with htrack_block("Load nanotron config"): # Create nanotron config if not checkpoint_config_path.endswith(".yaml"): diff --git a/src/lighteval/main_tasks.py b/src/lighteval/main_tasks.py new file mode 100644 index 00000000..66834798 --- /dev/null +++ b/src/lighteval/main_tasks.py @@ -0,0 +1,77 @@ +# MIT License + +# Copyright (c) 2024 Taratra D. RAHARISON and The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+import os +from typing import Optional + +import typer +from typer import Argument, Option +from typing_extensions import Annotated + + +app = typer.Typer() +CACHE_DIR = os.getenv("HF_HOME") + + +@app.command() +def inspect( + tasks: Annotated[str, Argument(help="Id of tasks or path to a text file with a list of tasks")], + custom_tasks: Annotated[Optional[str], Option(help="Path to a file with custom tasks")] = None, + num_samples: Annotated[int, Option(help="Number of samples to display")] = 10, + show_config: Annotated[bool, Option(help="Will display the full task config")] = False, + cache_dir: Annotated[Optional[str], Option(help="Cache directory used to store datasets and models")] = CACHE_DIR, +): + """ + Inspect a tasks + """ + from dataclasses import asdict + from pprint import pformat + + from rich import print + + from lighteval.tasks.registry import Registry, taskinfo_selector + + registry = Registry(cache_dir=cache_dir, custom_tasks=custom_tasks) + + # Loading task + task_names_list, _ = taskinfo_selector(tasks, task_registry=registry) + task_dict = registry.get_task_dict(task_names_list) + for name, task in task_dict.items(): + print("-" * 10, name, "-" * 10) + if show_config: + print("-" * 10, "CONFIG") + task.cfg.print() + for ix, sample in enumerate(task.eval_docs()[: int(num_samples)]): + if ix == 0: + print("-" * 10, "SAMPLES") + print(f"-- sample {ix} --") + print(pformat(asdict(sample), indent=2)) + + +@app.command() +def list(): + """ + List all tasks + """ + from lighteval.tasks.registry import Registry + + registry = Registry(cache_dir=CACHE_DIR) + registry.print_all_tasks() diff --git a/src/lighteval/main_vllm.py b/src/lighteval/main_vllm.py new file mode 100644 index 00000000..4bd1681d --- /dev/null +++ b/src/lighteval/main_vllm.py @@ -0,0 +1,139 @@ +# MIT License + +# Copyright (c) 2024 The HuggingFace Team + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+import os +from typing import Optional + +from typer import Argument, Option +from typing_extensions import Annotated + + +TOKEN = os.getenv("HF_TOKEN") +CACHE_DIR: str = os.getenv("HF_HOME", "/scratch") + +HELP_PANNEL_NAME_1 = "Common Paramaters" +HELP_PANNEL_NAME_2 = "Logging Parameters" +HELP_PANNEL_NAME_3 = "Debug Paramaters" +HELP_PANNEL_NAME_4 = "Modeling Paramaters" + + +def vllm( + # === general === + model_args: Annotated[str, Argument(help="Model arguments in the form key1=value1,key2=value2,...")], + tasks: Annotated[str, Argument(help="Comma-separated list of tasks to evaluate on.")], + # === Common parameters === + use_chat_template: Annotated[ + bool, Option(help="Use chat template for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = False, + system_prompt: Annotated[ + Optional[str], Option(help="Use system prompt for evaluation.", rich_help_panel=HELP_PANNEL_NAME_4) + ] = None, + dataset_loading_processes: Annotated[ + int, Option(help="Number of processes to use for dataset loading.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + custom_tasks: Annotated[ + Optional[str], Option(help="Path to custom tasks directory.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = None, + cache_dir: Annotated[ + str, Option(help="Cache directory for datasets and models.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = CACHE_DIR, + num_fewshot_seeds: Annotated[ + int, Option(help="Number of seeds to use for few-shot evaluation.", rich_help_panel=HELP_PANNEL_NAME_1) + ] = 1, + # === saving === + output_dir: Annotated[ + str, Option(help="Output directory for evaluation results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = "results", + push_to_hub: Annotated[ + bool, Option(help="Push results to the huggingface hub.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + push_to_tensorboard: Annotated[ + bool, Option(help="Push results to tensorboard.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + public_run: Annotated[ + bool, Option(help="Push results and details to a public repo.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + results_org: Annotated[ + Optional[str], Option(help="Organization to push results to.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = None, + save_details: Annotated[ + bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANNEL_NAME_2) + ] = False, + # === debug === + max_samples: Annotated[ + Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = None, + job_id: Annotated[ + int, Option(help="Optional job id for future refenrence.", rich_help_panel=HELP_PANNEL_NAME_3) + ] = 0, +): + """ + Evaluate models using vllm as backend. 
+ """ + from lighteval.logging.evaluation_tracker import EvaluationTracker + from lighteval.models.model_config import VLLMModelConfig + from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters + + TOKEN = os.getenv("HF_TOKEN") + + env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + + evaluation_tracker = EvaluationTracker( + output_dir=output_dir, + save_details=save_details, + push_to_hub=push_to_hub, + push_to_tensorboard=push_to_tensorboard, + public=public_run, + hub_results_org=results_org, + ) + + pipeline_params = PipelineParameters( + launcher_type=ParallelismManager.VLLM, + env_config=env_config, + job_id=job_id, + dataset_loading_processes=dataset_loading_processes, + custom_tasks_directory=custom_tasks, + override_batch_size=-1, # Cannot override batch size when using VLLM + num_fewshot_seeds=num_fewshot_seeds, + max_samples=max_samples, + use_chat_template=use_chat_template, + system_prompt=system_prompt, + ) + + model_args_dict: dict = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} + model_config = VLLMModelConfig(**model_args_dict) + + pipeline = Pipeline( + tasks=tasks, + pipeline_parameters=pipeline_params, + evaluation_tracker=evaluation_tracker, + model_config=model_config, + ) + + pipeline.evaluate() + + pipeline.show_results() + + results = pipeline.get_results() + + pipeline.save_and_push_results() + + return results diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index a315d892..2afaead1 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -24,7 +24,6 @@ from typing import Dict, Optional, Union import torch -import yaml from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig, PretrainedConfig from lighteval.logging.hierarchical_logger import hlog @@ -118,6 +117,17 @@ class BaseModelConfig: def __post_init__(self): # Making sure this parameter is a boolean self.multichoice_continuations_start_space = boolstring_to_bool(self.multichoice_continuations_start_space) + + if self.multichoice_continuations_start_space is not None: + if self.multichoice_continuations_start_space: + hlog( + "You set `multichoice_continuations_start_space` to true. This will force multichoice continuations to use a starting space" + ) + else: + hlog( + "You set `multichoice_continuations_start_space` to false. This will remove a leading space from multichoice continuations, if present." + ) + self.model_parallel = boolstring_to_bool(self.model_parallel) self.compile = boolstring_to_bool(self.compile) @@ -297,155 +307,3 @@ def nullable_keys() -> list[str]: that are not required and can remain None. """ return ["namespace", "env_vars", "image_url"] - - -def create_model_config( # noqa: C901 - use_chat_template: bool, - override_batch_size: int, - accelerator: Union["Accelerator", None], - model_args: Union[str, dict] = None, - model_config_path: str = None, -) -> Union[ - BaseModelConfig, - AdapterModelConfig, - DeltaModelConfig, - TGIModelConfig, - InferenceEndpointModelConfig, - DummyModelConfig, - VLLMModelConfig, - OpenAIModelConfig, -]: - """ - Create a model configuration based on the provided arguments. - - Args: - accelerator(Union[Accelerator, None]): accelerator to use for model training. - use_chat_template (bool): whether to use the chat template or not. 
Set to True for chat or ift models - override_batch_size (int): frozen batch size to use - model_args (Optional[Union[str, dict]]): Parameters to create the model, passed as a string (like the CLI kwargs or dict). - This option only allows to create a dummy model using `dummy` or a base model (using accelerate or no accelerator), in - which case corresponding full model args available are the arguments of the [[BaseModelConfig]]. - Minimal configuration is `pretrained=`. - model_config_path (Optional[str]): Path to the parameters to create the model, passed as a config file. This allows to create - all possible model configurations (base, adapter, peft, inference endpoints, tgi...) - - Returns: - Union[BaseModelConfig, AdapterModelConfig, DeltaModelConfig, TGIModelConfig, InferenceEndpointModelConfig, DummyModelConfig]: model configuration. - - Raises: - ValueError: If both an inference server address and model arguments are provided. - ValueError: If multichoice continuations both should start with a space and should not start with a space. - ValueError: If a base model is not specified when using delta weights or adapter weights. - ValueError: If a base model is specified when not using delta weights or adapter weights. - """ - if model_args is None and model_config_path is None: - raise ValueError("You can't create a model without either a list of model_args or a model_config_path.") - - if model_args: - if isinstance(model_args, str): - model_args = {k.split("=")[0]: k.split("=")[1] if "=" in k else True for k in model_args.split(",")} - - if model_args.pop("dummy", False): - return DummyModelConfig(**model_args) - - if model_args.pop("vllm", False): - return VLLMModelConfig(**model_args) - - if model_args.pop("openai", False): - return OpenAIModelConfig(**model_args) - - model_args["accelerator"] = accelerator - model_args["use_chat_template"] = use_chat_template - model_args["compile"] = bool(model_args["compile"]) if "compile" in model_args else False - - return BaseModelConfig(**model_args) - - with open(model_config_path, "r") as f: - config = yaml.safe_load(f)["model"] - - if config["type"] == "tgi": - return TGIModelConfig( - inference_server_address=config["instance"]["inference_server_address"], - inference_server_auth=config["instance"]["inference_server_auth"], - model_id=config["instance"]["model_id"], - ) - - if config["type"] == "endpoint": - reuse_existing_endpoint = config["base_params"].get("reuse_existing", None) - complete_config_endpoint = all( - val not in [None, ""] - for key, val in config.get("instance", {}).items() - if key not in InferenceEndpointModelConfig.nullable_keys() - ) - if reuse_existing_endpoint or complete_config_endpoint: - return InferenceEndpointModelConfig( - name=config["base_params"]["endpoint_name"].replace(".", "-").lower(), - repository=config["base_params"]["model"], - model_dtype=config["base_params"]["dtype"], - revision=config["base_params"]["revision"] or "main", - should_reuse_existing=reuse_existing_endpoint, - accelerator=config["instance"]["accelerator"], - region=config["instance"]["region"], - vendor=config["instance"]["vendor"], - instance_size=config["instance"]["instance_size"], - instance_type=config["instance"]["instance_type"], - namespace=config["instance"]["namespace"], - image_url=config["instance"].get("image_url", None), - env_vars=config["instance"].get("env_vars", None), - ) - return InferenceModelConfig(model=config["base_params"]["endpoint_name"]) - - if config["type"] == "base": - # Creating the multichoice space 
parameters - # We need to take into account possible conversion issues from our different input formats - multichoice_continuations_start_space = boolstring_to_bool( - config["generation"]["multichoice_continuations_start_space"] - ) - - if multichoice_continuations_start_space is not None: - if multichoice_continuations_start_space: - hlog( - "You set `multichoice_continuations_start_space` to true. This will force multichoice continuations to use a starting space" - ) - else: - hlog( - "You set `multichoice_continuations_start_space` to false. This will remove a leading space from multichoice continuations, if present." - ) - - # Creating optional quantization configuration - if config["base_params"]["dtype"] == "4bit": - quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16) - elif config["base_params"]["dtype"] == "8bit": - quantization_config = BitsAndBytesConfig(load_in_8bit=True) - else: - quantization_config = None - - # We extract the model args - args_dict = {k.split("=")[0]: k.split("=")[1] for k in config["base_params"]["model_args"].split(",")} - - # We store the relevant other args - args_dict["base_model"] = config["merged_weights"]["base_model"] - args_dict["compile"] = bool(config["base_params"]["compile"]) - args_dict["dtype"] = config["base_params"]["dtype"] - args_dict["accelerator"] = accelerator - args_dict["quantization_config"] = quantization_config - args_dict["batch_size"] = override_batch_size - args_dict["multichoice_continuations_start_space"] = multichoice_continuations_start_space - args_dict["use_chat_template"] = use_chat_template - - # Keeping only non null params - args_dict = {k: v for k, v in args_dict.items() if v is not None} - - if config["merged_weights"]["delta_weights"]: - if config["merged_weights"]["base_model"] is None: - raise ValueError("You need to specify a base model when using delta weights") - return DeltaModelConfig(**args_dict) - if config["merged_weights"]["adapter_weights"]: - if config["merged_weights"]["base_model"] is None: - raise ValueError("You need to specify a base model when using adapter weights") - return AdapterModelConfig(**args_dict) - if config["merged_weights"]["base_model"] not in ["", None]: - raise ValueError("You can't specify a base model if you are not using delta/adapter weights") - return BaseModelConfig(**args_dict) - - raise ValueError(f"Unknown model type in your model config file: {config['type']}") diff --git a/src/lighteval/parsers.py b/src/lighteval/parsers.py deleted file mode 100644 index 3988fdbb..00000000 --- a/src/lighteval/parsers.py +++ /dev/null @@ -1,188 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -import argparse -import os - - -TOKEN = os.getenv("HF_TOKEN") -CACHE_DIR = os.getenv("HF_HOME") - - -def parser_accelerate(parser=None): - if parser is None: - parser = argparse.ArgumentParser( - description="CLI tool for lighteval, a lightweight framework for LLM evaluation" - ) - - group = parser.add_mutually_exclusive_group(required=True) - task_type_group = parser.add_mutually_exclusive_group(required=True) - - # Model type: either use a config file or simply the model name - task_type_group.add_argument( - "--model_config_path", - type=str, - help="Path to the model config file, e.g. 'examples/model_configs/base_model.yaml'", - ) - task_type_group.add_argument( - "--model_args", - type=str, - help="Model arguments to pass to the model class, e.g. 'pretrained=gpt2,dtype=float16'", - ) - - # Debug - parser.add_argument("--max_samples", type=int, default=None, help="Maximum number of samples to evaluate on") - parser.add_argument("--override_batch_size", type=int, default=-1) - parser.add_argument("--job_id", type=str, help="Optional Job ID for future reference", default="") - - # Saving - parser.add_argument( - "--output_dir", - required=True, - type=str, - help="Directory to save the results, fsspec compliant (e.g. s3://bucket/path)", - ) - parser.add_argument("--save_details", action="store_true", help="Save the details of the run in the output_dir") - parser.add_argument("--push_to_hub", default=False, action="store_true", help="Set to push the details to the hub") - parser.add_argument("--push_to_tensorboard", default=False, action="store_true") - parser.add_argument( - "--public_run", default=False, action="store_true", help="Push results and details to a public repo" - ) - parser.add_argument( - "--results_org", - type=str, - help="Hub organisation where you want to store the results. Your current token must have write access to it", - default=None, - ) - # Common parameters - parser.add_argument( - "--use_chat_template", - default=False, - action="store_true", - help="Use the chat template (from the model's tokenizer) for the prompt", - ) - parser.add_argument( - "--system_prompt", type=str, default=None, help="System prompt to use, e.g. 'You are a helpful assistant.'" - ) - parser.add_argument( - "--dataset_loading_processes", type=int, default=1, help="Number of processes to use for loading the datasets" - ) - parser.add_argument( - "--custom_tasks", - type=str, - default=None, - help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)", - ) - group.add_argument( - "--tasks", - type=str, - default=None, - help="Id of a task, e.g. 
'original|mmlu:abstract_algebra|5' or path to a texte file with a list of tasks", - ) - parser.add_argument( - "--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models" - ) - parser.add_argument("--num_fewshot_seeds", type=int, default=1, help="Number of trials the few shots") - return parser - - -def parser_baseline(parser=None): - if parser is None: - parser = argparse.ArgumentParser( - description="CLI tool for lighteval, a lightweight framework for LLM evaluation" - ) - - parser.add_argument( - "--custom_tasks", - type=str, - default=None, - help="Path to a file with custom tasks (a TASK list of dict and potentially prompt formating functions)", - ) - - parser.add_argument( - "--tasks", - type=str, - required=True, - help="Task to compute the baseline for", - ) - parser.add_argument("--max_samples", type=int, default=None, help="Maximum number of samples to evaluate on") - parser.add_argument( - "--dataset_loading_processes", type=int, default=1, help="Number of processes to use for loading the datasets" - ) - - parser.add_argument( - "--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models" - ) - # Ooutput related - parser.add_argument( - "--output_dir", - required=True, - type=str, - help="Directory to save the results, fsspec compliant (e.g. s3://bucket/path)", - ) - - return parser - - -def parser_nanotron(parser=None): - if parser is None: - parser = argparse.ArgumentParser( - description="CLI tool for lighteval, a lightweight framework for LLM evaluation" - ) - - parser.add_argument( - "--checkpoint_config_path", - type=str, - required=True, - help="Path to the nanotron checkpoint YAML or python config file, potentially on S3", - ) - parser.add_argument( - "--lighteval_config_path", - type=str, - help="Path to a YAML or python lighteval config to be used for the evaluation. Lighteval key in nanotron config is ignored!", - required=True, - ) - parser.add_argument( - "--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models" - ) - - -def parser_utils_tasks(parser=None): - if parser is None: - parser = argparse.ArgumentParser( - description="CLI tool for lighteval, a lightweight framework for LLM evaluation" - ) - - group = parser.add_mutually_exclusive_group(required=True) - - group.add_argument("--list", action="store_true", help="List available tasks") - group.add_argument( - "--inspect", - type=str, - default=None, - help="Id of tasks or path to a text file with a list of tasks (e.g. 
'original|mmlu:abstract_algebra|5') for which you want to manually inspect samples.", - ) - parser.add_argument("--custom_tasks", type=str, default=None, help="Path to a file with custom tasks") - parser.add_argument("--num_samples", type=int, default=10, help="Number of samples to display") - parser.add_argument("--show_config", default=False, action="store_true", help="Will display the full task config") - parser.add_argument( - "--cache_dir", type=str, default=CACHE_DIR, help="Cache directory used to store datasets and models" - ) diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index da4fb045..e429e519 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -42,10 +42,14 @@ from lighteval.utils.imports import ( NO_ACCELERATE_ERROR_MSG, NO_NANOTRON_ERROR_MSG, + NO_OPENAI_ERROR_MSG, NO_TGI_ERROR_MSG, + NO_VLLM_ERROR_MSG, is_accelerate_available, is_nanotron_available, + is_openai_available, is_tgi_available, + is_vllm_available, ) from lighteval.utils.parallelism import test_all_gather from lighteval.utils.utils import EnvConfig, make_results_table @@ -65,6 +69,8 @@ class ParallelismManager(Enum): ACCELERATE = auto() NANOTRON = auto() TGI = auto() + OPENAI = auto() + VLLM = auto() NONE = auto() @@ -85,16 +91,22 @@ class PipelineParameters: use_chat_template: bool = False system_prompt: str | None = None - def __post_init__(self): + def __post_init__(self): # noqa C901 if self.launcher_type == ParallelismManager.ACCELERATE: if not is_accelerate_available(): raise ImportError(NO_ACCELERATE_ERROR_MSG) + elif self.launcher_type == ParallelismManager.VLLM: + if not is_vllm_available(): + raise ImportError(NO_VLLM_ERROR_MSG) elif self.launcher_type == ParallelismManager.TGI: if not is_tgi_available(): raise ImportError(NO_TGI_ERROR_MSG) elif self.launcher_type == ParallelismManager.NANOTRON: if not is_nanotron_available(): raise ImportError(NO_NANOTRON_ERROR_MSG) + elif self.launcher_type == ParallelismManager.OPENAI: + if not is_openai_available(): + raise ImportError(NO_OPENAI_ERROR_MSG) class Pipeline: diff --git a/tests/test_main.py b/tests/test_main.py index 32211d00..d0c85b33 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -28,8 +28,7 @@ import pytest from pytest import approx -from lighteval.main_accelerate import main # noqa: E402 -from lighteval.parsers import parser_accelerate +from lighteval.main_accelerate import accelerate # noqa: E402 from tests.reference_scores.reference_task_scores import RESULTS_FULL, RESULTS_LITE # noqa: E402 from tests.reference_scores.reference_tasks import ALL_SUBSETS @@ -37,10 +36,6 @@ # Set env var for deterministic run of models os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" -# Set cache for github actions -os.environ["HF_DATASETS_CACHE"] = "cache/datasets/" -os.environ["HF_HOME"] = "cache/models/" - # To add new models or tasks, change here # ! 
The correct results must be present in reference_task_scores MODELS = ["gpt2"] @@ -53,39 +48,29 @@ @lru_cache(maxsize=len(MODELS)) def run_model_predictions_full(model: str, tasks: tuple): """Runs the full main as a black box, using the input model and tasks, on all samples without parallelism""" - lighteval_args = ["--model_args", f"pretrained={model}", "--tasks", ",".join(tasks)] - lighteval_args += [ - "--override_batch_size", - "1", - "--output_dir", - "", - "--dataset_loading_processes", - "1", - "--save_details", - ] - parser = parser_accelerate() - args = parser.parse_args(lighteval_args) - results = main(args) + results = accelerate( + model_args=f"pretrained={model}", + tasks=",".join(tasks), + override_batch_size=1, + output_dir="", + dataset_loading_processes=1, + save_details=True, + ) return results @lru_cache(maxsize=len(MODELS)) def run_model_predictions_lite(model: str, tasks: tuple): """Runs the full main as a black box, using the input model and tasks, on 10 samples without parallelism""" - lighteval_args = ["--model_args", f"pretrained={model}", "--tasks", ",".join(tasks)] - lighteval_args += [ - "--override_batch_size", - "1", - "--output_dir", - "", - "--dataset_loading_processes", - "1", - "--save_details", - ] - lighteval_args += ["--max_samples", "10"] - parser = parser_accelerate() - args = parser.parse_args(lighteval_args) - results = main(args) + results = accelerate( + model_args=f"pretrained={model}", + tasks=",".join(tasks), + override_batch_size=1, + output_dir="", + dataset_loading_processes=1, + save_details=True, + max_samples=10, + ) return results
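Note: the typer-based commands introduced in this diff are plain Python functions, so they can be driven programmatically in the same way the updated tests now call `accelerate(...)` directly. Below is a minimal, hypothetical sketch of the analogous call for the new `vllm` command. It assumes the function is importable as `lighteval.main_vllm.vllm` (the module path is not shown in this diff), that the vllm backend is installed, and that the model and task strings are placeholders only.

```python
# Hypothetical smoke test mirroring run_model_predictions_lite, but driving the
# new typer-based `vllm` command instead of `accelerate`.
# Assumption: the function defined above lives in `lighteval.main_vllm`.
from lighteval.main_vllm import vllm

results = vllm(
    model_args="pretrained=gpt2",           # split on "," and "=" into VLLMModelConfig kwargs
    tasks="leaderboard|truthfulqa:mc|0|0",  # suite|task|num_fewshot|truncate_few_shots
    output_dir="./evals",
    max_samples=10,                         # keep the run small
    save_details=True,
)
```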