From 1fb796868bbe765421988e19e3a07d39ca1181dd Mon Sep 17 00:00:00 2001
From: Nathan Habib <30601243+NathanHB@users.noreply.github.com>
Date: Thu, 5 Dec 2024 13:26:32 +0100
Subject: [PATCH] redo logging (#415)

- revamped the logging with a config modifying every logger used in lighteval
- made accelerate a default requirement
- fixed some documentation
---
 README.md | 24 +-
 docs/source/installation.mdx | 1 -
 pyproject.toml | 5 +-
 src/lighteval/__main__.py | 26 +++
 src/lighteval/data.py | 9 +-
 src/lighteval/logging/evaluation_tracker.py | 24 +-
 src/lighteval/logging/hierarchical_logger.py | 166 --------------
 src/lighteval/logging/info_loggers.py | 9 +-
 src/lighteval/main_accelerate.py | 1 +
 src/lighteval/metrics/imports/bert_scorer.py | 14 +-
 .../metrics/imports/data_stats_metric.py | 7 +-
 src/lighteval/metrics/imports/summac.py | 5 +-
 src/lighteval/metrics/llm_as_judge.py | 4 +-
 src/lighteval/metrics/metrics_corpus.py | 7 +-
 src/lighteval/metrics/metrics_sample.py | 23 +-
 src/lighteval/metrics/sample_preparator.py | 6 +-
 src/lighteval/metrics/stderr.py | 8 +-
 .../metrics/utils/linguistic_tokenizers.py | 7 +-
 src/lighteval/models/adapter_model.py | 10 +-
 src/lighteval/models/base_model.py | 35 +--
 src/lighteval/models/delta_model.py | 11 +-
 src/lighteval/models/endpoint_model.py | 32 +--
 src/lighteval/models/model_config.py | 10 +-
 src/lighteval/models/model_loader.py | 9 +-
 src/lighteval/models/nanotron_model.py | 9 +-
 src/lighteval/models/openai_model.py | 7 +-
 src/lighteval/models/vllm_model.py | 9 +-
 src/lighteval/pipeline.py | 210 +++++++++---------
 src/lighteval/tasks/default_prompts.py | 11 +-
 src/lighteval/tasks/lighteval_task.py | 10 +-
 src/lighteval/tasks/prompt_manager.py | 7 +-
 src/lighteval/tasks/registry.py | 20 +-
 src/lighteval/utils/parallelism.py | 13 +-
 33 files changed, 345 insertions(+), 404 deletions(-)
 delete mode 100644 src/lighteval/logging/hierarchical_logger.py

diff --git a/README.md b/README.md
index f554ed17..0ee4010c 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ Hub, S3, or locally.
 ## ⚡️ Installation
 
 ```bash
-pip install lighteval[accelerate]
+pip install lighteval
 ```
 
 Lighteval allows for many extras when installing, see [here](https://github.com/huggingface/lighteval/wiki/Installation) for a complete list.
@@ -71,20 +71,24 @@ huggingface-cli login
 
 Lighteval offers two main entry points for model evaluation:
 
-
-* `lighteval accelerate`: evaluate models on CPU or one or more GPUs using [🤗
-  Accelerate](https://github.com/huggingface/accelerate).
-* `lighteval nanotron`: evaluate models in distributed settings using [⚡️
-  Nanotron](https://github.com/huggingface/nanotron).
+- `lighteval accelerate` : evaluate models on CPU or one or more GPUs using [🤗
+  Accelerate](https://github.com/huggingface/accelerate)
+- `lighteval nanotron`: evaluate models in distributed settings using [⚡️
+  Nanotron](https://github.com/huggingface/nanotron)
+- `lighteval vllm`: evaluate models on one or more GPUs using [🚀
+  VLLM](https://github.com/vllm-project/vllm)
+- `lighteval endpoint`
+    - `inference-endpoint`: evaluate models on one or more GPUs using [🔗
+      Inference Endpoint](https://huggingface.co/inference-endpoints/dedicated)
+    - `tgi`: evaluate models on one or more GPUs using [🔗 Text Generation Inference](https://huggingface.co/docs/text-generation-inference/en/index)
+    - `openai`: evaluate models on one or more GPUs using [🔗 OpenAI API](https://platform.openai.com/)
 
 Here’s a quick command to evaluate using the Accelerate backend:
 
 ```shell
 lighteval accelerate \
-    --model_args "pretrained=gpt2" \
-    --tasks "leaderboard|truthfulqa:mc|0|0" \
-    --override_batch_size 1 \
-    --output_dir="./evals/"
+    "pretrained=gpt2" \
+    "leaderboard|truthfulqa:mc|0|0"
 ```
 
 ## 🙏 Acknowledgements

diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index 39ac2b89..542c0975 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -25,7 +25,6 @@ appropriate extras group.
 
 | extra name | description |
 |--------------|---------------------------------------------------------------------------|
-| accelerate | To use accelerate for model and data parallelism with transformers models |
 | tgi | To use Text Generation Inference API to evaluate your model |
 | nanotron | To evaluate nanotron models |
 | quantization | To evaluate quantized models |

diff --git a/pyproject.toml b/pyproject.toml
index 1a99b6a6..9a4d3a3c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -55,6 +55,7 @@ keywords = ["evaluation", "nlp", "llm"]
 dependencies = [
     # Base dependencies
     "transformers>=4.38.0",
+    "accelerate",
     "huggingface_hub>=0.23.0",
     "torch>=2.0,<2.5",
     "GitPython>=3.1.41", # for logging
@@ -64,7 +65,8 @@ dependencies = [
     "typer",
     "termcolor==2.3.0",
     "pytablewriter",
-    "colorama",
+    "rich",
+    "colorlog",
     # Extension of metrics
     "aenum==3.1.15",
     # Base metrics
@@ -80,7 +82,6 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-accelerate = ["accelerate"]
 tgi = ["text-generation==0.6.0"]
 optimum = ["optimum==1.12.0"]
 quantization = ["bitsandbytes>=0.41.0", "auto-gptq>=0.4.2"]

diff --git a/src/lighteval/__main__.py b/src/lighteval/__main__.py
index c715723d..4484f781 100644
--- a/src/lighteval/__main__.py
+++ b/src/lighteval/__main__.py
@@ -19,7 +19,10 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
+import logging +from logging.config import dictConfig +import colorlog import typer import lighteval.main_accelerate @@ -32,6 +35,29 @@ app = typer.Typer() +logging_config = dict( # noqa C408 + version=1, + formatters={ + "c": { + "()": colorlog.ColoredFormatter, + "format": "[%(asctime)s] [%(log_color)s%(levelname)8s%(reset)s]: %(message)s (%(filename)s:%(lineno)s)", + "log_colors": { + "DEBUG": "cyan", + "INFO": "green", + "WARNING": "yellow", + "ERROR": "red", + "CRITICAL": "red,bg_white", + }, + }, + }, + handlers={"h": {"class": "logging.StreamHandler", "formatter": "c", "level": logging.INFO}}, + root={ + "handlers": ["h"], + "level": logging.INFO, + }, +) + +dictConfig(logging_config) app.command(rich_help_panel="Evaluation Backends")(lighteval.main_accelerate.accelerate) app.command(rich_help_panel="Evaluation Utils")(lighteval.main_baseline.baseline) diff --git a/src/lighteval/data.py b/src/lighteval/data.py index 74dedf22..7cb105e6 100644 --- a/src/lighteval/data.py +++ b/src/lighteval/data.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging import math from typing import Iterator, Tuple @@ -27,7 +28,6 @@ from torch.utils.data import Dataset from torch.utils.data.distributed import DistributedSampler, T_co -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.tasks.requests import ( GreedyUntilRequest, LoglikelihoodRequest, @@ -37,6 +37,9 @@ ) +logger = logging.getLogger(__name__) + + class DynamicBatchDataset(Dataset): def __init__( self, @@ -76,7 +79,7 @@ def __init__( def init_split_limits(self, num_dataset_splits): if num_dataset_splits >= self.total_size: - hlog_warn( + logger.warning( f"num_dataset_splits ({num_dataset_splits}) >= total_size ({self.total_size}), setting num_dataset_splits to 1" ) num_dataset_splits = 1 @@ -247,7 +250,7 @@ def init_split_limits(self, num_dataset_splits): _type_: _description_ """ if num_dataset_splits is not None: - hlog_warn( + logger.warning( "You cannot select the number of dataset splits for a generative evaluation at the moment. Automatically inferring." 
) diff --git a/src/lighteval/logging/evaluation_tracker.py b/src/lighteval/logging/evaluation_tracker.py index 58ae7410..01705534 100644 --- a/src/lighteval/logging/evaluation_tracker.py +++ b/src/lighteval/logging/evaluation_tracker.py @@ -22,6 +22,7 @@ import copy import json +import logging import os import re import time @@ -37,7 +38,6 @@ from fsspec import url_to_fs from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HFSummaryWriter, hf_hub_url -from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.logging.info_loggers import ( DetailsLogger, GeneralConfigLogger, @@ -49,6 +49,8 @@ from lighteval.utils.utils import obj_to_markdown +logger = logging.getLogger(__name__) + if is_nanotron_available(): from nanotron.config import GeneralArgs # type: ignore @@ -147,7 +149,7 @@ def __init__( def save(self) -> None: """Saves the experiment information and results to files, and to the hub if requested.""" - hlog("Saving experiment tracker") + logger.info("Saving experiment tracker") date_id = datetime.now().isoformat().replace(":", "-") # We first prepare data to save @@ -202,7 +204,7 @@ def save_results(self, date_id: str, results_dict: dict): output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name self.fs.mkdirs(output_dir_results, exist_ok=True) output_results_file = output_dir_results / f"results_{date_id}.json" - hlog(f"Saving results to {output_results_file}") + logger.info(f"Saving results to {output_results_file}") with self.fs.open(output_results_file, "w") as f: f.write(json.dumps(results_dict, cls=EnhancedJSONEncoder, indent=2, ensure_ascii=False)) @@ -210,7 +212,7 @@ def save_details(self, date_id: str, details_datasets: dict[str, Dataset]): output_dir_details = Path(self.output_dir) / "details" / self.general_config_logger.model_name output_dir_details_sub_folder = output_dir_details / date_id self.fs.mkdirs(output_dir_details_sub_folder, exist_ok=True) - hlog(f"Saving details to {output_dir_details_sub_folder}") + logger.info(f"Saving details to {output_dir_details_sub_folder}") for task_name, dataset in details_datasets.items(): output_file_details = output_dir_details_sub_folder / f"details_{task_name}_{date_id}.parquet" with self.fs.open(str(output_file_details), "wb") as f: @@ -255,7 +257,7 @@ def push_to_hub( if not self.api.repo_exists(repo_id): self.api.create_repo(repo_id, private=not (self.public), repo_type="dataset", exist_ok=True) - hlog(f"Repository {repo_id} not found, creating it.") + logger.info(f"Repository {repo_id} not found, creating it.") # We upload it both as a json and a parquet file result_file_base_name = f"results_{date_id}" @@ -490,11 +492,11 @@ def push_to_tensorboard( # noqa: C901 self, results: dict[str, dict[str, float]], details: dict[str, DetailsLogger.CompiledDetail] ): if not is_tensorboardX_available: - hlog_warn(NO_TENSORBOARDX_WARN_MSG) + logger.warning(NO_TENSORBOARDX_WARN_MSG) return if not is_nanotron_available(): - hlog_warn("You cannot push results to tensorboard without having nanotron installed. Skipping") + logger.warning("You cannot push results to tensorboard without having nanotron installed. Skipping") return prefix = self.tensorboard_metric_prefix @@ -526,14 +528,14 @@ def push_to_tensorboard( # noqa: C901 bench_suite = None if ":" in task_name: bench_suite = task_name.split(":")[0] # e.g. 
MMLU - hlog(f"bench_suite {bench_suite} in {task_name}") + logger.info(f"bench_suite {bench_suite} in {task_name}") for metric, value in values.items(): if "stderr" in metric: continue if bench_suite not in bench_averages: bench_averages[bench_suite] = {} bench_averages[bench_suite][metric] = bench_averages[bench_suite].get(metric, []) + [float(value)] - hlog(f"Pushing {task_name} {values} to tensorboard") + logger.info(f"Pushing {task_name} {values} to tensorboard") for metric, value in values.items(): if "stderr" in metric: tb_context.add_scalar(f"stderr_{prefix}/{task_name}/{metric}", value, global_step=global_step) @@ -546,7 +548,7 @@ def push_to_tensorboard( # noqa: C901 # Tasks with subtasks for name, values in bench_averages.items(): for metric, values in values.items(): - hlog(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard") + logger.info(f"Pushing average {name} {metric} {sum(values) / len(values)} to tensorboard") tb_context.add_scalar(f"{prefix}/{name}/{metric}", sum(values) / len(values), global_step=global_step) tb_context.add_text("eval_config", obj_to_markdown(results), global_step=global_step) @@ -571,7 +573,7 @@ def push_to_tensorboard( # noqa: C901 # Now we can push to the hub tb_context.scheduler.trigger() - hlog( + logger.info( f"Pushed to tensorboard at https://huggingface.co/{self.tensorboard_repo}/{output_dir_tb}/tensorboard" f" at global_step {global_step}" ) diff --git a/src/lighteval/logging/hierarchical_logger.py b/src/lighteval/logging/hierarchical_logger.py deleted file mode 100644 index ac8d59d8..00000000 --- a/src/lighteval/logging/hierarchical_logger.py +++ /dev/null @@ -1,166 +0,0 @@ -# MIT License - -# Copyright (c) 2024 The HuggingFace Team - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -import sys -import time -from datetime import timedelta -from logging import Logger -from typing import Any, Callable - -from colorama import Fore, Style - - -logger = Logger(__name__, level="INFO") - - -class HierarchicalLogger: - """ - Tracks the execution flow of the code as blocks, along with how long we're spending in each block. - Should not be called on its own, use [`hlog`], [`hlog_warn`] and [`hlog_err`] to log things, and - [`htrack_block`] to start a block level log step, or [`htrack`] to start a function level log step. 
- """ - - def __init__(self) -> None: - self.start_times: list[float] = [] - - def indent(self) -> str: - """Manages the block level text indentation for nested blocks""" - return " " * len(self.start_times) - - def track_begin(self, x: Any) -> None: - """Starts a block level tracker, stores the step begin time""" - logger.warning(f"{self.indent()}{str(x)} \u007b") # \u007b is { - sys.stdout.flush() - self.start_times.append(time.time()) - - def track_end(self) -> None: - """Ends a block level tracker, prints the elapsed time for the associated step""" - duration = time.time() - self.start_times.pop() - logger.warning(f"{self.indent()}\u007d [{str(timedelta(seconds=duration))}]") # \u007d is } - sys.stdout.flush() - - def log(self, x: Any) -> None: - logger.warning(self.indent() + str(x)) - sys.stdout.flush() - - -HIERARCHICAL_LOGGER = HierarchicalLogger() -BACKUP_LOGGER = Logger(__name__, level="INFO") - - -# Exposed public methods -def hlog(x: Any) -> None: - """Info logger. - - Logs a string version of x through the singleton [`HierarchicalLogger`]. - """ - try: - HIERARCHICAL_LOGGER.log(x) - except RuntimeError: - BACKUP_LOGGER.warning(x) - - -def hlog_warn(x: Any) -> None: - """Warning logger. - - Logs a string version of x, which will appear in a yellow color, through the singleton [`HierarchicalLogger`]. - """ - try: - HIERARCHICAL_LOGGER.log(Fore.YELLOW + str(x) + Style.RESET_ALL) - except RuntimeError: - BACKUP_LOGGER.warning(Fore.YELLOW + str(x) + Style.RESET_ALL) - - -def hlog_err(x: Any) -> None: - """Error logger. - - Logs a string version of x, which will appear in a red color, through the singleton [`HierarchicalLogger`]. - """ - try: - HIERARCHICAL_LOGGER.log(Fore.RED + str(x) + Style.RESET_ALL) - except RuntimeError: - BACKUP_LOGGER.warning(Fore.RED + str(x) + Style.RESET_ALL) - - -class htrack_block: - """ - Block annotator: hierarchical logging block, which encapsulate the current step's logs and duration. - - Usage: - with htrack_block('Step'): - hlog('current logs') - - Output: - Step { - current logs - } [0s] - """ - - def __init__(self, x: Any) -> None: - self.x = x - - def __enter__(self) -> None: - HIERARCHICAL_LOGGER.track_begin(self.x) - - def __exit__(self, tpe: Any, value: Any, callback: Any) -> None: - HIERARCHICAL_LOGGER.track_end() - - -class htrack: - """ - Function annotator: prints called function parameters, then opens an hierarchical [`htrack_block`] - which encapsulate the current step's logs and duration. - - Usage: - @htrack() - def function(args): - with htrack_block('Step'): - hlog('current logs') - - Output: - function: args, { - Step { - current logs - } [0s] - } - """ - - def __call__(self, fn: Callable) -> Any: - def wrapper(*args, **kwargs): # type:ignore - # Parent name to prepend - if len(args) > 0 and hasattr(args[0], fn.__name__): - parent = type(args[0]).__name__ + "." - else: - parent = "" - - args_list = "" - if len(args) > 0 or len(kwargs) > 0: - args_list = ": " - for v in enumerate(args): - args_list += f"{str(v)}, " - for k, v in kwargs.items(): - args_list += f"{str(k)}: {str(v)}, " - - with htrack_block(parent + fn.__name__ + args_list): - return fn(*args, **kwargs) - - return wrapper diff --git a/src/lighteval/logging/info_loggers.py b/src/lighteval/logging/info_loggers.py index b96a875d..46d3ab5c 100644 --- a/src/lighteval/logging/info_loggers.py +++ b/src/lighteval/logging/info_loggers.py @@ -21,6 +21,7 @@ # SOFTWARE. 
import collections +import logging import os import time from dataclasses import asdict, dataclass, field @@ -30,7 +31,6 @@ import numpy as np import xxhash -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.metrics import MetricCategory from lighteval.metrics.stderr import get_stderr_function from lighteval.models.abstract_model import ModelInfo @@ -41,6 +41,9 @@ from lighteval.utils.utils import as_list, sanitize_numpy +logger = logging.getLogger(__name__) + + if is_nanotron_available(): from nanotron.config import Config @@ -507,7 +510,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = try: metric_result = task.aggregation()[metric_name](metric_values) except OverflowError: - hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when aggregating.") + logger.warning(f"{task_name}, {metric_name} got an OVERFLOW ERROR when aggregating.") metric_result = float("nan") except KeyError: continue @@ -529,7 +532,7 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int = except OverflowError: # Is this need or should we just pass? self.metric_aggregated[task_name][f"{metric_name}_stderr"] = float("nan") - hlog_warn(f"{task_name}, {metric_name} got an OVERFLOW ERROR when computing stderr.") + logger.warning(f"{task_name}, {metric_name} got an OVERFLOW ERROR when computing stderr.") # We group subtasks which belong to the same parent task, like MMLU, to compute an average on them # and compute an average of all metrics diff --git a/src/lighteval/main_accelerate.py b/src/lighteval/main_accelerate.py index f6ed6b38..e7d18c80 100644 --- a/src/lighteval/main_accelerate.py +++ b/src/lighteval/main_accelerate.py @@ -114,6 +114,7 @@ def accelerate( # noqa C901 cache_dir = CACHE_DIR env_config = EnvConfig(token=TOKEN, cache_dir=cache_dir) + evaluation_tracker = EvaluationTracker( output_dir=output_dir, save_details=save_details, diff --git a/src/lighteval/metrics/imports/bert_scorer.py b/src/lighteval/metrics/imports/bert_scorer.py index 347dcc02..1012bc3f 100644 --- a/src/lighteval/metrics/imports/bert_scorer.py +++ b/src/lighteval/metrics/imports/bert_scorer.py @@ -22,6 +22,7 @@ # SOFTWARE. 
"""Simplified version of the BertScorer lib - we only import what we need.""" +import logging import os import time from collections import defaultdict @@ -31,7 +32,8 @@ from torch.nn.utils.rnn import pad_sequence from transformers import AutoModel, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog, hlog_warn + +logger = logging.getLogger(__name__) def padding(arr, pad_token, dtype=torch.long): @@ -218,14 +220,14 @@ def greedy_cos_idf( F = F.view(L, B) if torch.any(hyp_zero_mask): - hlog_warn( + logger.warning( "Warning: Empty candidate sentence detected; setting raw BERTscores to 0.", ) P = P.masked_fill(hyp_zero_mask, 0.0) R = R.masked_fill(hyp_zero_mask, 0.0) if torch.any(ref_zero_mask): - hlog_warn("Warning: Empty reference sentence detected; setting raw BERTScores to 0.") + logger.warning("Empty reference sentence detected; setting raw BERTScores to 0.") P = P.masked_fill(ref_zero_mask, 0.0) R = R.masked_fill(ref_zero_mask, 0.0) @@ -441,7 +443,7 @@ def score(self, cands, refs, verbose=False, batch_size=64, return_hash=False): """ if self._model is None: - hlog(f"Loading BERTScorer model `{self._model_type}`") + logger.info(f"Loading BERTScorer model `{self._model_type}`") self._tokenizer = AutoTokenizer.from_pretrained(self._model_type) self._model = AutoModel.from_pretrained(self._model_type) self._model.eval() @@ -460,7 +462,7 @@ def score(self, cands, refs, verbose=False, batch_size=64, return_hash=False): count += len(ref_group) if verbose: - hlog("calculating scores...") + logger.info("calculating scores...") start = time.perf_counter() if self.idf: @@ -496,6 +498,6 @@ def score(self, cands, refs, verbose=False, batch_size=64, return_hash=False): if verbose: time_diff = time.perf_counter() - start - hlog(f"done in {time_diff:.2f} seconds, {len(refs) / time_diff:.2f} sentences/sec") + logger.info(f"done in {time_diff:.2f} seconds, {len(refs) / time_diff:.2f} sentences/sec") return out diff --git a/src/lighteval/metrics/imports/data_stats_metric.py b/src/lighteval/metrics/imports/data_stats_metric.py index 9982e5da..119e6cad 100644 --- a/src/lighteval/metrics/imports/data_stats_metric.py +++ b/src/lighteval/metrics/imports/data_stats_metric.py @@ -24,15 +24,18 @@ # pylint: disable=C0103,W0221,W0106 # Replace summ_eval.data_stats_metric +import logging from collections import Counter from multiprocessing import Pool import spacy -from lighteval.logging.hierarchical_logger import hlog from lighteval.metrics.imports.data_stats_utils import Fragments +logger = logging.getLogger(__name__) + + _en = None @@ -78,7 +81,7 @@ def __init__(self, n_gram=3, n_workers=24, case=False, tokenize=True): try: _en = spacy.load("en_core_web_sm") except OSError: - hlog("Downloading the spacy en_core_web_sm model\n" "(don't worry, this will only happen once)") + logger.info("Downloading the spacy en_core_web_sm model\n(don't worry, this will only happen once)") from spacy.cli import download download("en_core_web_sm") diff --git a/src/lighteval/metrics/imports/summac.py b/src/lighteval/metrics/imports/summac.py index 2803ba59..e64dab86 100644 --- a/src/lighteval/metrics/imports/summac.py +++ b/src/lighteval/metrics/imports/summac.py @@ -4,6 +4,7 @@ ############################################### import json +import logging import os import time @@ -13,8 +14,8 @@ import tqdm from transformers import AutoModelForSequenceClassification, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog +logger = logging.getLogger(__name__) # GPU-related business @@ -40,7 +41,7 @@ def 
wait_free_gpu(gb_needed): def select_freer_gpu(): freer_gpu = str(get_freer_gpu()) - hlog("Will use GPU: %s" % (freer_gpu)) + logger.info("Will use GPU: %s" % (freer_gpu)) os.environ["CUDA_LAUNCH_BLOCKING"] = "1" os.environ["CUDA_VISIBLE_DEVICES"] = "" + freer_gpu return freer_gpu diff --git a/src/lighteval/metrics/llm_as_judge.py b/src/lighteval/metrics/llm_as_judge.py index a8f363d4..a4b5dfb1 100644 --- a/src/lighteval/metrics/llm_as_judge.py +++ b/src/lighteval/metrics/llm_as_judge.py @@ -28,12 +28,12 @@ from tqdm import tqdm -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.utils.imports import is_openai_available, is_vllm_available logging.getLogger("openai").setLevel(logging.ERROR) logging.getLogger("httpx").setLevel(logging.ERROR) +logger = logging.getLogger(__name__) class JudgeLM: @@ -211,6 +211,6 @@ def __call_api(self, prompt): text = response.choices[0].message.content return text except Exception as e: - hlog_warn(f"{type(e), e}") + logger.warning(f"{type(e), e}") time.sleep(self.API_RETRY_SLEEP) raise Exception("Failed to get response from the API") diff --git a/src/lighteval/metrics/metrics_corpus.py b/src/lighteval/metrics/metrics_corpus.py index 1286ab08..03b1b2c5 100644 --- a/src/lighteval/metrics/metrics_corpus.py +++ b/src/lighteval/metrics/metrics_corpus.py @@ -24,13 +24,13 @@ Some metrics (such as corpus BLEU) are not computed at the individual item level, but over all the corpus. A number of these aggregations come from the EleutherAIHarness """ +import logging import math import numpy as np import sacrebleu import sklearn.metrics -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.metrics.sample_preparator import ( GenerativeCorpusMetricInput, LogprobCorpusMetricInput, @@ -39,6 +39,9 @@ from lighteval.utils.utils import as_list +logger = logging.getLogger(__name__) + + # General aggregations def matthews_corrcoef(items: list[GenerativeCorpusMetricInput]) -> float: """Computes the Matthews Correlation Coefficient, using scikit learn ([doc](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.matthews_corrcoef.html)). @@ -108,7 +111,7 @@ def compute(self, items: list[GenerativeCorpusMetricInput]) -> float: for i in items: pred = as_list(i.preds) if len(pred) > 1: - hlog_warn( + logger.info( f"Multiple predictions present, keeping only the first prediction (when computing sacrebleu.{self.metric.__name__})." ) preds.append(pred[0]) diff --git a/src/lighteval/metrics/metrics_sample.py b/src/lighteval/metrics/metrics_sample.py index 4fdc4293..2081b560 100644 --- a/src/lighteval/metrics/metrics_sample.py +++ b/src/lighteval/metrics/metrics_sample.py @@ -24,6 +24,7 @@ using simple function (min, mean, max, ...) at the corpus level. Most metrics fall under this category. 
""" +import logging import os from typing import Callable, Literal @@ -34,10 +35,8 @@ from nltk.tokenize import word_tokenize from nltk.tokenize.treebank import TreebankWordTokenizer from nltk.translate.bleu_score import sentence_bleu -from rouge_score import rouge_scorer, scoring from transformers import AutoModelForSequenceClassification, AutoTokenizer -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.metrics.imports.bert_scorer import BERTScorer from lighteval.metrics.imports.data_stats_metric import DataStatsMetric from lighteval.metrics.imports.summac import SummaCZS @@ -53,6 +52,9 @@ from lighteval.utils.utils import as_list, safe_divide +logger = logging.getLogger(__name__) + + class ExactMatches: def __init__( self, @@ -464,7 +466,7 @@ def __init__( default tokenizer will be used. """ if aggregation_function and bootstrap: - hlog_warn("Can't use both bootstrapping and an aggregation function in Rouge. Keeping bootstrap.") + logger.warning("Can't use both bootstrapping and an aggregation function in Rouge. Keeping bootstrap.") self.aggregation_function = aggregation_function if self.aggregation_function is None: self.aggregation_function = np.mean @@ -474,11 +476,11 @@ def __init__( raise ValueError( f"Rouge was initialised with method {methods}, which is not in {','.join(self.ALLOWED_ROUGE_METHODS)}" ) - self.scorer = rouge_scorer.RougeScorer([methods], tokenizer=tokenizer) self.multiple_golds = multiple_golds self.bootstrap = bootstrap self.normalize_gold = normalize_gold self.normalize_pred = normalize_pred + self.tokenizer = tokenizer def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float | dict: """Computes the metric(s) over a list of golds and predictions for one single sample. @@ -491,6 +493,11 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> float | float or dict: Aggregated score over the current sample's items. If several rouge functions have been selected, returns a dict which maps name and scores. """ + from rouge_score import rouge_scorer + + if self.scorer is None: + self.scorer = rouge_scorer.RougeScorer(self.methods, tokenizer=self.tokenizer) + # Normalize if self.normalize_gold: golds = [self.normalize_gold(g) for g in golds] @@ -527,6 +534,8 @@ def _rouge_score_multi_golds(self, golds: list[str], preds: list[str]): return {method: self.aggregation_function(scores[method]) for method in self.methods} def _rouge_score_with_bootsrap(self, golds: list[str], preds: list[str]): + from rouge_score import scoring + aggregator = scoring.BootstrapAggregator() for g, p in zip(golds, preds): aggregator.add_scores(self.scorer.score(g, p)) @@ -575,7 +584,7 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict: dict: Scores over the current sample's items. """ if self.bert_scorer is None: - hlog_warn("The first metric computation step might be a bit longer as we need to download the model.") + logger.warning("The first metric computation step might be a bit longer as we need to download the model.") # We only initialize on first compute self.bert_scorer = BERTScorer( model_type="microsoft/deberta-large-mnli", lang="en", rescale_with_baseline=True, num_layers=9 @@ -787,7 +796,9 @@ def compute(self, golds: list[str], predictions: list[str], **kwargs) -> dict: dict: The different scores computed """ if len(golds) > 1: - hlog_warn("Provided more than one gold to compute a string distance metric. 
Just using the first one.") + logger.warning( + "Provided more than one gold to compute a string distance metric. Just using the first one." + ) reference = golds[0] result = {m: [] for m in self.metric_types} diff --git a/src/lighteval/metrics/sample_preparator.py b/src/lighteval/metrics/sample_preparator.py index dc32d95c..4fafa509 100644 --- a/src/lighteval/metrics/sample_preparator.py +++ b/src/lighteval/metrics/sample_preparator.py @@ -20,12 +20,14 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging import re from dataclasses import asdict, dataclass import numpy as np -from lighteval.logging.hierarchical_logger import hlog_warn + +logger = logging.getLogger(__name__) @dataclass @@ -92,7 +94,7 @@ def prepare(self, gold_ixs: list[int], choices_logprob: list[float], **kwargs) - """ if self.is_single_token: if len(gold_ixs) > 1: - hlog_warn( + logger.warning( "The current sample has more than one gold available, which is unexpected. We selected only the first one for the corpus aggregation of the loglikelihood metric." ) return LogprobCorpusMetricInput(golds=gold_ixs[0], preds=np.argmax(choices_logprob)) diff --git a/src/lighteval/metrics/stderr.py b/src/lighteval/metrics/stderr.py index 388521a6..751e29e0 100644 --- a/src/lighteval/metrics/stderr.py +++ b/src/lighteval/metrics/stderr.py @@ -24,6 +24,7 @@ # We kept it because it's very fast - however, we renamed the variables # and added documentation +import logging import math import random from typing import Callable, Optional @@ -32,7 +33,8 @@ from scipy.stats import bootstrap from tqdm import tqdm -from lighteval.logging.hierarchical_logger import hlog + +logger = logging.getLogger(__name__) def _stddev(arr): @@ -78,7 +80,7 @@ def bootstrap_stderr(metric: Callable, population: list, number_experiments: int number_draws = min(1000, number_experiments) number_seeds = number_experiments // number_draws - hlog(f"Bootstrapping {metric.__name__}'s stderr with {number_seeds} seeds.") + logger.info(f"Bootstrapping {metric.__name__}'s stderr with {number_seeds} seeds.") for seed in range(number_seeds): # sample w replacement res.extend(_bootstrap_internal(metric=metric, number_draws=number_draws)((population, seed))) @@ -106,7 +108,7 @@ def bootstrap_stderr_scipy(metric: Callable, population: list, number_experiment Same as bootstrap_stderr, but uses scipy. It's kept for archive, as it overflows for big datasets """ - hlog(f"Bootstrapping {metric.__name__}'s stderr.") + logger.info(f"Bootstrapping {metric.__name__}'s stderr.") res = bootstrap( data=[population], statistic=metric, diff --git a/src/lighteval/metrics/utils/linguistic_tokenizers.py b/src/lighteval/metrics/utils/linguistic_tokenizers.py index 3bc84eab..e0dd9ef1 100644 --- a/src/lighteval/metrics/utils/linguistic_tokenizers.py +++ b/src/lighteval/metrics/utils/linguistic_tokenizers.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import logging from abc import ABC, abstractmethod from functools import lru_cache from typing import Callable, Iterator -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.utils.imports import ( NO_SPACY_TOKENIZER_ERROR_MSG, NO_STANZA_TOKENIZER_ERROR_MSG, @@ -26,6 +26,9 @@ from lighteval.utils.language import Language +logger = logging.getLogger(__name__) + + # Copy of https://github.com/huggingface/datatrove/blob/main/src/datatrove/utils/tokenization.py def strip_strings(els: list[str]) -> list[str]: return [el.strip() for el in els if len(el.strip()) > 0] @@ -270,6 +273,6 @@ def span_tokenize(self, text: str) -> list[tuple[int, int]]: def get_word_tokenizer(language: Language) -> WordTokenizer: tokenizer = TOKENIZER_FACTORY.get(language) if tokenizer is None: - hlog_warn(f"No word tokenizer found for language {language}, will split on spaces.") + logger.warning(f"No word tokenizer found for language {language}, will split on spaces.") return WhitespaceTokenizer() return tokenizer() diff --git a/src/lighteval/models/adapter_model.py b/src/lighteval/models/adapter_model.py index dbf762d7..24de80f4 100644 --- a/src/lighteval/models/adapter_model.py +++ b/src/lighteval/models/adapter_model.py @@ -20,12 +20,12 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging from contextlib import nullcontext import torch from transformers import AutoModelForCausalLM, PreTrainedTokenizer -from lighteval.logging.hierarchical_logger import hlog from lighteval.models.base_model import BaseModel from lighteval.models.model_config import AdapterModelConfig from lighteval.models.utils import _get_dtype @@ -33,6 +33,8 @@ from lighteval.utils.utils import EnvConfig +logger = logging.getLogger(__name__) + if is_peft_available(): from peft import PeftModel @@ -60,7 +62,7 @@ def _create_auto_model(self, config: AdapterModelConfig, env_config: EnvConfig) merged_path = f"{adapter_weights}-adapter-applied" if self.accelerator.is_local_main_process if self.accelerator is not None else nullcontext(): - hlog(f"Loading model from {adapter_weights} and applying adapter to {config.base_model}") + logger.info(f"Loading model from {adapter_weights} and applying adapter to {config.base_model}") base = AutoModelForCausalLM.from_pretrained( config.base_model, torch_dtype=torch.float16, low_cpu_mem_usage=True, token=env_config.token ) @@ -68,10 +70,10 @@ def _create_auto_model(self, config: AdapterModelConfig, env_config: EnvConfig) model = PeftModel.from_pretrained(base, adapter_weights) model = model.merge_and_unload() - hlog("Saving model with adapter applied") + logger.info("Saving model with adapter applied") base.save_pretrained(merged_path) - hlog(f"Loading model from {merged_path}") + logger.info(f"Loading model from {merged_path}") model = AutoModelForCausalLM.from_pretrained( merged_path, diff --git a/src/lighteval/models/base_model.py b/src/lighteval/models/base_model.py index 993978d5..fedc56a5 100644 --- a/src/lighteval/models/base_model.py +++ b/src/lighteval/models/base_model.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import logging import os from typing import Optional, Tuple, Union @@ -33,7 +34,6 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset, LoglikelihoodSingleTokenDataset -from lighteval.logging.hierarchical_logger import hlog, hlog_err, hlog_warn from lighteval.models.abstract_model import LightevalModel, ModelInfo from lighteval.models.model_config import BaseModelConfig from lighteval.models.model_output import ( @@ -57,6 +57,9 @@ from lighteval.utils.utils import EnvConfig, as_list +logger = logging.getLogger(__name__) + + if is_accelerate_available(): from accelerate import Accelerator from accelerate.utils import calculate_maximum_sizes, convert_bytes, get_max_memory @@ -91,10 +94,10 @@ def __init__( # We are in DP (and launch the script with `accelerate launch`) if not config.model_parallel and not isinstance(config.quantization_config, BitsAndBytesConfig): - hlog(f"Using Data Parallelism, putting model on device {self._device}") + logger.info(f"Using Data Parallelism, putting model on device {self._device}") self.model = self.model.to(self._device) if config.compile: - hlog("Compiling the model") + logger.info("Compiling the model") self.model.model.compile() self.model_name = _simplify_name(config.pretrained) @@ -202,7 +205,7 @@ def init_model_parallel(self, model_parallel: bool | None = None) -> Tuple[bool, self.num_local_processes = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) self.num_machines = int(os.environ.get("WORLD_SIZE", 0)) // self.num_local_processes if self.num_machines == 0: - hlog("We are not in a distributed setting. Setting model_parallel to False.") + logger.info("We are not in a distributed setting. Setting model_parallel to False.") model_parallel = False if model_parallel is None: @@ -210,7 +213,7 @@ def init_model_parallel(self, model_parallel: bool | None = None) -> Tuple[bool, if "cpu" in max_memory_all_gpus: del max_memory_all_gpus["cpu"] model_parallel = bool(self.num_local_processes < len(max_memory_all_gpus)) - hlog( + logger.info( f"Setting model parallel to {model_parallel} since " f"the number of local processes is {self.num_local_processes} " f"and the number of GPUs is {len(max_memory_all_gpus)}" @@ -225,13 +228,13 @@ def init_model_parallel(self, model_parallel: bool | None = None) -> Tuple[bool, if k % self.num_local_processes == (self.accelerator.process_index % self.num_local_processes) } device_map = "auto" - hlog( + logger.info( f"Model parallel was set to True, setting max memory per GPU to {max_mem_this_process} and device map to {device_map}" ) else: max_mem_this_process = None device_map = None - hlog( + logger.info( f"Model parallel was set to False, max memory set to {max_mem_this_process} and device map to {device_map}" ) return model_parallel, max_mem_this_process, device_map @@ -332,7 +335,9 @@ def _create_auto_tokenizer_with_name( truncation_side="left", ) except FileNotFoundError: - hlog_warn("Problem when loading the tokenizer in the cache - discarding the provided cache path value.") + logger.warning( + "Problem when loading the tokenizer in the cache - discarding the provided cache path value." 
+ ) tokenizer = AutoTokenizer.from_pretrained( model_name if tokenizer_name is None else tokenizer_name, revision=revision + (f"/{subfolder}" if subfolder is not None else ""), @@ -343,7 +348,7 @@ def _create_auto_tokenizer_with_name( ) tokenizer.pad_token = tokenizer.eos_token tokenizer.model_max_length = self.max_length - hlog("Tokenizer truncation and padding size set to the left side.") + logger.info("Tokenizer truncation and padding size set to the left side.") return tokenizer @@ -409,7 +414,7 @@ def _model_call(self, inputs: torch.Tensor) -> torch.Tensor: def _get_batch_size(self, max_input_length: int, override_bs: int = 0, starting_batch_size: int = 512) -> int: if override_bs > 0: return override_bs - hlog(f"Detecting largest batch size with max_input_length={max_input_length}") + logger.info(f"Detecting largest batch size with max_input_length={max_input_length}") @find_executable_batch_size( starting_batch_size=starting_batch_size @@ -422,7 +427,7 @@ def forward_batch(batch_size): return batch_size batch_size = forward_batch() - hlog(f"Determined largest batch size: {batch_size}") + logger.info(f"Determined largest batch size: {batch_size}") return batch_size def greedy_until_multi_turn( # noqa: C901 @@ -440,7 +445,7 @@ def greedy_until_multi_turn( # noqa: C901 if self.accelerator: dataloader = self.accelerator.prepare(dataloader) - hlog_warn("Running greedy multi turn generation, the batch size is set to 1 for this task.") + logger.warning("Running greedy multi turn generation, the batch size is set to 1 for this task.") for request_batch in tqdm( dataloader, desc="Greedy Multi Turn generation", position=1, leave=False, disable=self.disable_tqdm @@ -650,7 +655,7 @@ def greedy_until( # should have been managed by the prompt creator/few shot manager if requested by the user. context_size = tokenized["input_ids"].shape[1] if context_size > self.max_length: - hlog_warn( + logger.warning( f"The context size of your batch ({context_size}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in" + str({i.task_name for i in batch}) + ". This is likely to lead to some errors." # noqa C401 @@ -949,7 +954,7 @@ def prepare_batch_logprob( padded = [] if max_context is None: - hlog_warn("max_context is None, using max_length") + logger.warning("max_context is None, using max_length") max_context = self.max_length # Each sample is concatenated and cut to length or padded to max_length @@ -964,7 +969,7 @@ def prepare_batch_logprob( padding_length = padding_length if padding_length is not None else sequence_len if padding_length - sequence_len < 0: - hlog_err(f"Padding length {padding_length} is smaller than input length {sequence_len}") + logger.warning(f"Padding length {padding_length} is smaller than input length {sequence_len}") raise ValueError("Negative padding") padded.append(padding_length - sequence_len) diff --git a/src/lighteval/models/delta_model.py b/src/lighteval/models/delta_model.py index 69fba37a..9aa8c01d 100644 --- a/src/lighteval/models/delta_model.py +++ b/src/lighteval/models/delta_model.py @@ -20,19 +20,22 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import logging from contextlib import nullcontext import torch from tqdm import tqdm from transformers import AutoModelForCausalLM -from lighteval.logging.hierarchical_logger import hlog from lighteval.models.base_model import BaseModel from lighteval.models.model_config import DeltaModelConfig from lighteval.models.utils import _get_dtype from lighteval.utils.utils import EnvConfig +logger = logging.getLogger(__name__) + + class DeltaModel(BaseModel): def _create_auto_model( self, @@ -48,7 +51,7 @@ def _create_auto_model( merged_path = f"{delta_model}-delta-applied" if self.accelerator.is_main_process if self.accelerator is not None else nullcontext(): - hlog(f"Loading base and delta models from {config.base_model} and {delta_model}") + logger.info(f"Loading base and delta models from {config.base_model} and {delta_model}") base = AutoModelForCausalLM.from_pretrained( config.base_model, torch_dtype=torch.float16, low_cpu_mem_usage=True, token=env_config.token ) @@ -64,10 +67,10 @@ def _create_auto_model( assert name in delta.state_dict() param.data += delta.state_dict()[name] - hlog("Saving delta-applied model") + logger.info("Saving delta-applied model") base.save_pretrained(merged_path) - hlog(f"Loading delta-applied model from {delta_model}-delta-applied") + logger.info(f"Loading delta-applied model from {delta_model}-delta-applied") model = AutoModelForCausalLM.from_pretrained( merged_path, diff --git a/src/lighteval/models/endpoint_model.py b/src/lighteval/models/endpoint_model.py index bc2c7eac..bd82f058 100644 --- a/src/lighteval/models/endpoint_model.py +++ b/src/lighteval/models/endpoint_model.py @@ -21,6 +21,7 @@ # SOFTWARE. import asyncio +import logging import re import time from typing import Coroutine, List, Optional, Union @@ -45,7 +46,6 @@ from transformers import AutoTokenizer from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset -from lighteval.logging.hierarchical_logger import hlog, hlog_err, hlog_warn from lighteval.models.abstract_model import LightevalModel, ModelInfo from lighteval.models.model_config import InferenceEndpointModelConfig, InferenceModelConfig from lighteval.models.model_output import GenerativeResponse, LoglikelihoodResponse, LoglikelihoodSingleTokenResponse @@ -58,6 +58,8 @@ from lighteval.utils.utils import EnvConfig, as_list +logger = logging.getLogger(__name__) + BATCH_SIZE = 50 MAX_TIME_FOR_SPINUP = 3600 @@ -117,7 +119,7 @@ def __init__( # noqa: C901 try: if self.endpoint is None: # Endpoint does not exist yet locally if not config.should_reuse_existing: # New endpoint - hlog("Creating endpoint.") + logger.info("Creating endpoint.") self.endpoint: InferenceEndpoint = create_inference_endpoint( name=endpoint_name, namespace=config.namespace, @@ -150,7 +152,7 @@ def __init__( # noqa: C901 }, ) else: # Endpoint exists - hlog("Reusing existing endpoint.") + logger.info("Reusing existing endpoint.") self.endpoint = get_inference_endpoint( name=endpoint_name, token=env_config.token, namespace=config.namespace ) @@ -158,13 +160,13 @@ def __init__( # noqa: C901 else: # Endpoint exists locally but either failed (and most likely it must be scaled up) if must_scaleup_endpoint: - hlog("Rescaling existing endpoint.") + logger.info("Rescaling existing endpoint.") self.endpoint.update(instance_size=instance_size, instance_type=instance_type) must_scaleup_endpoint = False # or we got a connection error, in which case we do nothing and just wait at the next step # Waits for the endpoint to be deployed - we could also check for the status in 
updating', 'pending', 'initializing' - hlog("Trying to deploy your endpoint. Please wait for 10 min.") + logger.info("Trying to deploy your endpoint. Please wait for 10 min.") self.endpoint.wait(timeout=600, refresh_every=60) # We wait for 10 min except InferenceEndpointError as e: instance_type, instance_size = InferenceEndpointModel.get_larger_hardware_suggestion( @@ -172,11 +174,13 @@ def __init__( # noqa: C901 ) must_scaleup_endpoint = True - hlog( + logger.info( f"Endpoint failed to start on current hardware with error {e}. Trying to autoscale to ({instance_type}, {instance_size})." ) except InferenceEndpointTimeoutError as e: - hlog_err("Endpoint did not start within 30 minutes, there was a timeout. Please inspect the logs.") + logger.error( + "Endpoint did not start within 30 minutes, there was a timeout. Please inspect the logs." + ) raise e except HfHubHTTPError as e: # The endpoint actually already exists, we'll spin it up instead of trying to create a new one @@ -185,20 +189,20 @@ def __init__( # noqa: C901 config.should_reuse_existing = True # Requested resources are not available elif "Bad Request: Compute instance not available yet" in str(e): - hlog_err( - "The hardware combination you are requesting does not seem to be available: ({instance_type}, {instance_size}, {config.region})." + logger.error( + f"The hardware combination you are requesting does not seem to be available: ({instance_type}, {instance_size}, {config.region})." ) raise e # User account does not have access to requested resources elif "Conflict: Quota exceeded" in str(e): raise e except ConnectionError as e: - hlog_err(f"Connection failed with error {e}. Retrying") + logger.error(f"Connection failed with error {e}. Retrying") if not self.endpoint.status == "running": raise Exception("Did not manage to start endpoint within the elapsed time and on suggested hardware.") - hlog("Endpoint successfully deployed!") + logger.info("Endpoint successfully deployed!") self.endpoint_name = config.endpoint_name self.name = self.endpoint.repository self.revision = self.endpoint.revision @@ -278,12 +282,12 @@ def cleanup(self): if self.endpoint is not None: if self.reuse_existing: self.endpoint.pause() - hlog_warn( + logger.warning( "Since your endpoint was existing before, we did not delete it, but paused it instead. You might want to delete it if you're done using it." ) else: self.endpoint.delete() - hlog_warn( + logger.warning( "We deleted the spinned up endpoint after using it. You'll need to create it again if you need to reuse it." ) @@ -425,7 +429,7 @@ def greedy_until( returns_logits = batch[0].use_logits num_samples = batch[0].num_samples if num_samples > 1: - hlog_err( + logger.error( "Inference endpoints does not allow sampling evaluations - this is likely to fail or provide problematic results" ) diff --git a/src/lighteval/models/model_config.py b/src/lighteval/models/model_config.py index 268e2a6f..1eda1e02 100644 --- a/src/lighteval/models/model_config.py +++ b/src/lighteval/models/model_config.py @@ -20,13 +20,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import logging from dataclasses import dataclass from typing import Dict, Optional, Union import torch from transformers import AutoConfig, BitsAndBytesConfig, GPTQConfig, PretrainedConfig -from lighteval.logging.hierarchical_logger import hlog from lighteval.models.utils import _get_model_sha from lighteval.utils.imports import ( NO_AUTOGPTQ_ERROR_MSG, @@ -40,6 +40,8 @@ from lighteval.utils.utils import EnvConfig, boolstring_to_bool +logger = logging.getLogger(__name__) + if is_accelerate_available(): from accelerate import Accelerator @@ -120,11 +122,11 @@ def __post_init__(self): if self.multichoice_continuations_start_space is not None: if self.multichoice_continuations_start_space: - hlog( + logger.info( "You set `multichoice_continuations_start_space` to true. This will force multichoice continuations to use a starting space" ) else: - hlog( + logger.info( "You set `multichoice_continuations_start_space` to false. This will remove a leading space from multichoice continuations, if present." ) @@ -154,7 +156,7 @@ def _init_configs(self, model_name: str, env_config: EnvConfig) -> PretrainedCon # Gathering the model's automatic quantization config, if available try: model_auto_quantization_config = auto_config.quantization_config - hlog("An automatic quantization config was found in the model's config. Using it to load the model") + logger.info("An automatic quantization config was found in the model's config. Using it to load the model") except (AttributeError, KeyError): model_auto_quantization_config = None diff --git a/src/lighteval/models/model_loader.py b/src/lighteval/models/model_loader.py index e783c86d..1a409746 100644 --- a/src/lighteval/models/model_loader.py +++ b/src/lighteval/models/model_loader.py @@ -20,9 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging from typing import Union -from lighteval.logging.hierarchical_logger import hlog from lighteval.models.adapter_model import AdapterModel from lighteval.models.base_model import BaseModel from lighteval.models.delta_model import DeltaModel @@ -52,6 +52,9 @@ from lighteval.utils.utils import EnvConfig +logger = logging.getLogger(__name__) + + def load_model( # noqa: C901 config: Union[ BaseModelConfig, @@ -104,7 +107,7 @@ def load_model_with_tgi(config: TGIModelConfig): if not is_tgi_available(): raise ImportError(NO_TGI_ERROR_MSG) - hlog(f"Load model from inference server: {config.inference_server_address}") + logger.info(f"Load model from inference server: {config.inference_server_address}") model = ModelClient( address=config.inference_server_address, auth_token=config.inference_server_auth, model_id=config.model_id ) @@ -121,7 +124,7 @@ def load_openai_model(config: OpenAIModelConfig, env_config: EnvConfig): def load_model_with_inference_endpoints(config: InferenceEndpointModelConfig, env_config: EnvConfig): - hlog("Spin up model using inference endpoint.") + logger.info("Spin up model using inference endpoint.") model = InferenceEndpointModel(config=config, env_config=env_config) return model diff --git a/src/lighteval/models/nanotron_model.py b/src/lighteval/models/nanotron_model.py index ded1624f..21b60504 100644 --- a/src/lighteval/models/nanotron_model.py +++ b/src/lighteval/models/nanotron_model.py @@ -21,6 +21,7 @@ # SOFTWARE. 
# ruff: noqa: C901 +import logging import os import time from typing import List, Optional, Tuple, Type, Union @@ -41,7 +42,6 @@ LoglikelihoodDataset, LoglikelihoodSingleTokenDataset, ) -from lighteval.logging.hierarchical_logger import hlog_err, hlog_warn from lighteval.models.base_model import LightevalModel, ModelInfo from lighteval.models.model_output import ( Batch, @@ -59,6 +59,9 @@ from lighteval.utils.utils import EnvConfig, as_list +logger = logging.getLogger(__name__) + + os.environ["TOKENIZERS_PARALLELISM"] = "false" TokenSequence = Union[List[int], torch.LongTensor, torch.Tensor, BatchEncoding] @@ -1186,7 +1189,7 @@ def greedy_until( returns_logits = batch[0].use_logits num_samples = batch[0].num_samples if num_samples > 1: - hlog_err( + logger.error( "Nonotron models does not allow sampling evaluations - this is likely to fail or provide problematic results" ) @@ -1210,7 +1213,7 @@ def greedy_until( # should have been managed by the prompt creator/few shot manager if requested by the user. context_size = tokenized["input_ids"].shape[1] if context_size > self.max_length: - hlog_warn( + logger.warning( f"The context size of your batch ({context_size}) is bigger than the maximum context size allowed by the model ({self.max_length}) for a task in" + str({i.task_name for i in batch}) + ". This is likely to lead to some errors." # noqa C401 diff --git a/src/lighteval/models/openai_model.py b/src/lighteval/models/openai_model.py index f799a45a..12fbeb95 100644 --- a/src/lighteval/models/openai_model.py +++ b/src/lighteval/models/openai_model.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import logging import os import time from concurrent.futures import ThreadPoolExecutor @@ -28,7 +29,6 @@ from tqdm import tqdm from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.models.abstract_model import LightevalModel from lighteval.models.endpoint_model import ModelInfo from lighteval.models.model_output import ( @@ -45,6 +45,9 @@ from lighteval.utils.imports import is_openai_available +logger = logging.getLogger(__name__) + + if is_openai_available(): import logging @@ -90,7 +93,7 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, logit_b ) return response except Exception as e: - hlog_warn(f"{type(e), e}") + logger.warning(f"{type(e), e}") time.sleep(self.API_RETRY_SLEEP) self.API_RETRY_SLEEP = self.API_RETRY_SLEEP**self.API_RETRY_MULTIPLIER raise Exception("Failed to get response from the API") diff --git a/src/lighteval/models/vllm_model.py b/src/lighteval/models/vllm_model.py index dc242c60..ecfe8fd8 100644 --- a/src/lighteval/models/vllm_model.py +++ b/src/lighteval/models/vllm_model.py @@ -22,6 +22,7 @@ import gc import itertools +import logging import os from typing import Optional @@ -29,7 +30,6 @@ from tqdm import tqdm from lighteval.data import GenerativeTaskDataset, LoglikelihoodDataset -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.models.abstract_model import LightevalModel, ModelInfo from lighteval.models.model_config import VLLMModelConfig from lighteval.models.model_output import ( @@ -45,6 +45,9 @@ from lighteval.utils.utils import EnvConfig, as_list +logger = logging.getLogger(__name__) + + if is_vllm_available(): import ray from more_itertools import distribute @@ -225,14 +228,14 @@ def greedy_until( # left truncate the inputs to the maximum length if 
max_new_tokens is not None: if context_size + max_new_tokens > self.max_length: - hlog_warn( + logger.warning( f"{context_size + max_new_tokens=} which is greather than {self.max_length=}. Truncating context to {self.max_length - max_new_tokens} tokens." ) context_size = self.max_length - max_new_tokens inputs = [input[-context_size:] for input in inputs] else: if context_size > self.max_length: - hlog_warn( + logger.warning( f"{context_size=} which is greather than {self.max_length=}. Truncating context to {self.max_length} tokens." ) context_size = self.max_length diff --git a/src/lighteval/pipeline.py b/src/lighteval/pipeline.py index e429e519..facecd8e 100644 --- a/src/lighteval/pipeline.py +++ b/src/lighteval/pipeline.py @@ -32,7 +32,6 @@ import numpy as np from lighteval.logging.evaluation_tracker import EvaluationTracker -from lighteval.logging.hierarchical_logger import hlog, htrack_block from lighteval.metrics.utils.metric_utils import MetricCategory from lighteval.models.model_loader import BaseModel, load_model from lighteval.models.model_output import ModelResponse @@ -65,6 +64,12 @@ from lighteval.models.nanotron_model import NanotronLightevalModel +import logging + + +logger = logging.getLogger(__name__) + + class ParallelismManager(Enum): ACCELERATE = auto() NANOTRON = auto() @@ -124,8 +129,8 @@ def __init__( self.pipeline_parameters = pipeline_parameters self.launcher_type = self.pipeline_parameters.launcher_type if self.pipeline_parameters.max_samples: - hlog( - "WARNING: --max_samples WAS SET. THESE NUMBERS ARE ONLY PARTIAL AND SHOULD NOT BE USED FOR COMPARISON UNLESS YOU KNOW WHAT YOU ARE DOING." + logger.warning( + "--max_samples WAS SET. THESE NUMBERS ARE ONLY PARTIAL AND SHOULD NOT BE USED FOR COMPARISON UNLESS YOU KNOW WHAT YOU ARE DOING." 
) self.model_config = model_config @@ -141,93 +146,88 @@ def __init__( def _init_parallelism_manager(self): accelerator, parallel_context = None, None - with htrack_block("Test all gather"): - if self.launcher_type == ParallelismManager.ACCELERATE: - if not is_accelerate_available(): - raise ValueError("You are trying to launch an accelerate model, but accelerate is not installed") - accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) - test_all_gather(accelerator=accelerator) - elif self.launcher_type == ParallelismManager.NANOTRON: - if not is_nanotron_available(): - raise ValueError("You are trying to launch a nanotron model, but nanotron is not installed") - dist.initialize_torch_distributed() - parallel_context = ParallelContext( - tensor_parallel_size=self.model_config.lighteval_config.parallelism.tp, - pipeline_parallel_size=self.model_config.lighteval_config.parallelism.pp, - data_parallel_size=self.model_config.lighteval_config.parallelism.dp, - ) - test_all_gather(parallel_context=parallel_context) + if self.launcher_type == ParallelismManager.ACCELERATE: + if not is_accelerate_available(): + raise ValueError("You are trying to launch an accelerate model, but accelerate is not installed") + accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))]) + test_all_gather(accelerator=accelerator) + elif self.launcher_type == ParallelismManager.NANOTRON: + if not is_nanotron_available(): + raise ValueError("You are trying to launch a nanotron model, but nanotron is not installed") + dist.initialize_torch_distributed() + parallel_context = ParallelContext( + tensor_parallel_size=self.model_config.lighteval_config.parallelism.tp, + pipeline_parallel_size=self.model_config.lighteval_config.parallelism.pp, + data_parallel_size=self.model_config.lighteval_config.parallelism.dp, + ) + test_all_gather(parallel_context=parallel_context) - return accelerator, parallel_context + return accelerator, parallel_context def _init_model(self, model_config, model): - with htrack_block("Model loading"): - if model_config is not None: - if self.parallel_context: - return NanotronLightevalModel( - checkpoint_path=os.path.dirname(self.pipeline_parameters.nanotron_checkpoint_path) - if self.pipeline_parameters.nanotron_checkpoint_path - else "", - nanotron_config=self.model_config, - parallel_context=self.parallel_context, - debug_one_layer_model=False, - model_class=None, - env_config=self.pipeline_parameters.env_config, - ) - else: - return load_model(config=model_config, env_config=self.pipeline_parameters.env_config) - if isinstance(model, BaseModel): - return model - else: - return BaseModel.from_model( - model=model, - use_chat_template=self.pipeline_parameters.use_chat_template, + logger.info("--- LOADING MODEL ---") + if model_config is not None: + if self.parallel_context: + return NanotronLightevalModel( + checkpoint_path=os.path.dirname(self.pipeline_parameters.nanotron_checkpoint_path) + if self.pipeline_parameters.nanotron_checkpoint_path + else "", + nanotron_config=self.model_config, + parallel_context=self.parallel_context, + debug_one_layer_model=False, + model_class=None, env_config=self.pipeline_parameters.env_config, - accelerator=self.accelerator, ) + else: + return load_model(config=model_config, env_config=self.pipeline_parameters.env_config) + if isinstance(model, BaseModel): + return model + else: + return BaseModel.from_model( + model=model, + 
use_chat_template=self.pipeline_parameters.use_chat_template, + env_config=self.pipeline_parameters.env_config, + accelerator=self.accelerator, + ) def _init_tasks_and_requests(self, tasks: str): - with htrack_block("Tasks loading"): - with local_ranks_zero_first() if self.launcher_type == ParallelismManager.NANOTRON else nullcontext(): - registry = Registry( - cache_dir=self.pipeline_parameters.env_config.cache_dir, - custom_tasks=self.pipeline_parameters.custom_tasks_directory, - ) - task_names_list, fewshots_dict = taskinfo_selector(tasks, registry) - task_dict = registry.get_task_dict(task_names_list) - LightevalTask.load_datasets( - list(task_dict.values()), self.pipeline_parameters.dataset_loading_processes - ) + with local_ranks_zero_first() if self.launcher_type == ParallelismManager.NANOTRON else nullcontext(): + logger.info("--- LOADING TASKS ---") + registry = Registry( + cache_dir=self.pipeline_parameters.env_config.cache_dir, + custom_tasks=self.pipeline_parameters.custom_tasks_directory, + ) + task_names_list, fewshots_dict = taskinfo_selector(tasks, registry) + task_dict = registry.get_task_dict(task_names_list) + LightevalTask.load_datasets(list(task_dict.values()), self.pipeline_parameters.dataset_loading_processes) - self.evaluation_tracker.task_config_logger.log(task_dict) - - hlog("Loading documents, and requests") - requests, docs = create_requests_from_tasks( - task_dict=task_dict, - fewshot_dict=fewshots_dict, - num_fewshot_seeds=self.pipeline_parameters.num_fewshot_seeds, - lm=self.model, - max_samples=self.pipeline_parameters.max_samples, - evaluation_tracker=self.evaluation_tracker, - use_chat_template=self.pipeline_parameters.use_chat_template, - system_prompt=self.pipeline_parameters.system_prompt, - ) + self.evaluation_tracker.task_config_logger.log(task_dict) + + requests, docs = create_requests_from_tasks( + task_dict=task_dict, + fewshot_dict=fewshots_dict, + num_fewshot_seeds=self.pipeline_parameters.num_fewshot_seeds, + lm=self.model, + max_samples=self.pipeline_parameters.max_samples, + evaluation_tracker=self.evaluation_tracker, + use_chat_template=self.pipeline_parameters.use_chat_template, + system_prompt=self.pipeline_parameters.system_prompt, + ) - self.task_names_list = task_names_list - self.task_dict = task_dict - self.fewshot_dict = fewshots_dict - self.requests = requests - self.docs = docs + self.task_names_list = task_names_list + self.task_dict = task_dict + self.fewshot_dict = fewshots_dict + self.requests = requests + self.docs = docs def _init_random_seeds(self): - with htrack_block("Setting seeds and waiting for all processes"): - hlog(f"setting seed to {1234} for random and numpy") - random.seed(1234) - np.random.seed(1234) - if self.accelerator is not None: - self.accelerator.wait_for_everyone() - if self.parallel_context is not None: - dist.barrier() + logger.info("--- INIT SEEDS ---") + random.seed(1234) + np.random.seed(1234) + if self.accelerator is not None: + self.accelerator.wait_for_everyone() + if self.parallel_context is not None: + dist.barrier() def is_main_process(self): if self.accelerator: @@ -237,43 +237,38 @@ def is_main_process(self): return True def evaluate(self): - with htrack_block("Evaluation"): - self.evaluation_tracker.general_config_logger.log_args_info( - num_fewshot_seeds=self.pipeline_parameters.num_fewshot_seeds, - override_batch_size=self.pipeline_parameters.override_batch_size, - max_samples=self.pipeline_parameters.max_samples, - job_id=self.pipeline_parameters.job_id, - config=self.model_config, - ) + 
self.evaluation_tracker.general_config_logger.log_args_info( + num_fewshot_seeds=self.pipeline_parameters.num_fewshot_seeds, + override_batch_size=self.pipeline_parameters.override_batch_size, + max_samples=self.pipeline_parameters.max_samples, + job_id=self.pipeline_parameters.job_id, + config=self.model_config, + ) - hlog(f"Evaluate on {len(self.task_names_list)} tasks.") - sample_id_to_responses = self._run_model() - self._compute_metrics(sample_id_to_responses) + sample_id_to_responses = self._run_model() + self._compute_metrics(sample_id_to_responses) if self.is_main_process(): - with htrack_block("Compiling results"): - self.evaluation_tracker.general_config_logger.log_end_time() - self.evaluation_tracker.metrics_logger.aggregate(task_dict=self.task_dict, bootstrap_iters=1000) - self.evaluation_tracker.details_logger.aggregate() - - with htrack_block("Cleaning up"): # For non nanotron models - for weights in ["delta", "adapter"]: - try: - tmp_weights_dir = ( - f"{self.evaluation_tracker.general_config_logger.model_name}-{weights}-applied" - ) - shutil.rmtree(tmp_weights_dir) - hlog(f"Removed {tmp_weights_dir}") - except OSError: - pass + self.evaluation_tracker.general_config_logger.log_end_time() + self.evaluation_tracker.metrics_logger.aggregate(task_dict=self.task_dict, bootstrap_iters=1000) + self.evaluation_tracker.details_logger.aggregate() + + for weights in ["delta", "adapter"]: + try: + tmp_weights_dir = f"{self.evaluation_tracker.general_config_logger.model_name}-{weights}-applied" + shutil.rmtree(tmp_weights_dir) + logger.info(f"Removed {tmp_weights_dir}") + except OSError: + pass def _run_model(self): # Running all requests depending on the model call type (log likelihood, generative, ...) # to be able to batch them + logger.info("--- RUNNING MODEL ---") sample_id_to_responses: dict[(SampleUid, MetricCategory), list[ModelResponse]] = collections.defaultdict(list) for request_type, requests in self.requests.items(): - hlog(f"Running {request_type} requests") + logger.info(f"Running {request_type} requests") run_model = self.model.get_method_from_request_type(request_type=request_type) responses = run_model(requests, override_bs=self.pipeline_parameters.override_batch_size) @@ -301,6 +296,7 @@ def _compute_metrics(self, sample_id_to_responses): # "responses": [[response1_1, response1_2, ...], [response2_1, response2_2, ...], ...], # "docs": [doc1, doc2, ...] 
# } + logger.info("--- COMPUTING METRICS ---") task_metric_category_groups = collections.defaultdict( lambda: collections.defaultdict(lambda: collections.defaultdict(list)) ) @@ -333,6 +329,7 @@ def _compute_metrics(self, sample_id_to_responses): self.evaluation_tracker.details_logger.log(task_name, task, doc, response, output) def save_and_push_results(self): + logger.info("--- SAVING AND PUSHING RESULTS ---") if self.is_main_process(): self.evaluation_tracker.save() @@ -342,6 +339,7 @@ def _init_final_dict(self): self.final_dict = self.evaluation_tracker.generate_final_dict() def show_results(self): + logger.info("--- DISPLAYING RESULTS ---") self._init_final_dict() if self.is_main_process(): print(make_results_table(self.final_dict)) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 5b6a3312..c5395281 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -22,6 +22,7 @@ import ast import json +import logging import random import re import string @@ -29,11 +30,13 @@ import numpy as np import pycountry -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.tasks.requests import Doc from lighteval.utils.utils import as_list +logger = logging.getLogger(__name__) + + # fmt: off LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"] INTEGER_INDICES = list(map(str, list(range(1, 27)))) @@ -277,7 +280,9 @@ def bbh_logical_deduction_three_objects(line, task_name: str = None): def bbh_movie_recommendation(line, task_name: str = None): if line["target"] == "Monsters, Inc": # this line is not correctly formatted - hlog_warn("One sample removed from task bbh:movie_recommentation because its line is incorrectly formatted.") + logger.warning( + "One sample removed from task bbh:movie_recommentation because its line is incorrectly formatted." 
+ ) return [] instruction = "Recommend movies similar to the given list of movies.\n\n" choices = [f"({c})" for c in LETTER_INDICES[:6]] @@ -318,7 +323,7 @@ def bbh_reasoning_about_colored_objects(line, task_name: str = None): def bbh_ruin_names(line, task_name: str = None): if line["target"] in ["dearth, wind, & fire", "rita, sue and bob poo"]: # line not correctly formatted - hlog_warn("One sample removed from task bbh:ruin_names because its line is incorrectly formatted.") + logger.warning("One sample removed from task bbh:ruin_names because its line is incorrectly formatted.") return [] instruction = "Select the humorous edit that 'ruins' the input movie or musical artist name.\n\n" choices = [f"({c})" for c in LETTER_INDICES[:6]] diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index cba69457..9d08ba12 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -22,6 +22,7 @@ import collections import inspect +import logging import random from dataclasses import asdict, dataclass, field from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple @@ -31,7 +32,6 @@ from multiprocess import Pool from pytablewriter import MarkdownTableWriter -from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.metrics import ( apply_generative_metric, apply_llm_as_judge_metric, @@ -60,6 +60,8 @@ if TYPE_CHECKING: from lighteval.logging.evaluation_tracker import EvaluationTracker +logger = logging.getLogger(__name__) + @dataclass class LightevalTaskConfig: @@ -188,7 +190,7 @@ def __init__( # noqa: C901 self.dataset_filter = cfg.hf_filter self.trust_dataset = cfg.trust_dataset self.dataset: Optional[DatasetDict] = None # Delayed download - hlog(f"{self.dataset_path} {self.dataset_config_name}") + logger.info(f"{self.dataset_path} {self.dataset_config_name}") self._fewshot_docs = None self._docs = None @@ -207,7 +209,7 @@ def __init__( # noqa: C901 ignored = [metric for metric in self.metrics if metric.category == MetricCategory.IGNORED] if len(ignored) > 0: - hlog_warn(f"[WARNING] Not implemented yet: ignoring the metric {' ,'.join(ignored)} for task {self.name}.") + logger.warning(f"Not implemented yet: ignoring the metric {' ,'.join(ignored)} for task {self.name}.") current_categories = [metric.category for metric in self.metrics] self.has_metric_category = {category: (category in current_categories) for category in MetricCategory} @@ -262,7 +264,7 @@ def get_first_possible_fewshot_splits( if len(stored_splits) > 0: return stored_splits[:number_of_splits] - hlog_warn(f"Careful, the task {self.name} is using evaluation data to build the few shot examples.") + logger.warning(f"Careful, the task {self.name} is using evaluation data to build the few shot examples.") return None def _get_docs_from_split(self, splits: list[str], few_shots=False) -> list[Doc]: diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py index 7555b72a..982a6654 100644 --- a/src/lighteval/tasks/prompt_manager.py +++ b/src/lighteval/tasks/prompt_manager.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import logging import random from collections import defaultdict from dataclasses import dataclass @@ -27,12 +28,14 @@ from itertools import cycle from typing import TYPE_CHECKING, Optional, Tuple, Union -from lighteval.logging.hierarchical_logger import hlog_warn from lighteval.models.abstract_model import LightevalModel from lighteval.tasks.requests import Doc from lighteval.utils.utils import as_list +logger = logging.getLogger(__name__) + + if TYPE_CHECKING: from lighteval.tasks.lighteval_task import LightevalTask @@ -416,5 +419,5 @@ def get_fewshot_seeds(self, few_shot_iterations: int = None) -> list[int]: if few_shot_iterations <= 1: return [0] seeds = range(few_shot_iterations) - hlog_warn(f"Running {self.task.name} with {few_shot_iterations} few-shot iterations.") + logger.warning(f"Running {self.task.name} with {few_shot_iterations} few-shot iterations.") return seeds diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py index 612d981d..69532c09 100644 --- a/src/lighteval/tasks/registry.py +++ b/src/lighteval/tasks/registry.py @@ -22,23 +22,24 @@ import collections import importlib +import logging import os from functools import lru_cache, partial from itertools import groupby from pathlib import Path -from pprint import pformat from types import ModuleType from typing import Callable, Dict, List, Optional, Union from datasets.load import dataset_module_factory import lighteval.tasks.default_tasks as default_tasks -from lighteval.logging.hierarchical_logger import hlog, hlog_warn from lighteval.tasks.extended import AVAILABLE_EXTENDED_TASKS_MODULES from lighteval.tasks.lighteval_task import LightevalTask, LightevalTaskConfig from lighteval.utils.imports import CANNOT_USE_EXTENDED_TASKS_MSG, can_load_extended_tasks +logger = logging.getLogger(__name__) + # Helm, Bigbench, Harness are implementations following an evaluation suite setup # Original follows the original implementation as closely as possible # Leaderboard are the evaluations we fixed on the open llm leaderboard - you should get similar results @@ -104,8 +105,7 @@ def get_task_instance(self, task_name: str): """ task_class = self.task_registry.get(task_name) if task_class is None: - hlog_warn(f"{task_name} not found in provided tasks") - hlog_warn(pformat(list(self.task_registry.keys()))) + logger.error(f"{task_name} not found in provided tasks") raise ValueError(f"Cannot find tasks {task_name} in task list or in custom task registry)") return task_class() @@ -133,12 +133,12 @@ def task_registry(self): for extended_task_module in AVAILABLE_EXTENDED_TASKS_MODULES: custom_tasks_module.append(extended_task_module) else: - hlog_warn(CANNOT_USE_EXTENDED_TASKS_MSG) + logger.warning(CANNOT_USE_EXTENDED_TASKS_MSG) for module in custom_tasks_module: TASKS_TABLE.extend(module.TASKS_TABLE) # We don't log the tasks themselves as it makes the logs unreadable - hlog(f"Found {len(module.TASKS_TABLE)} custom tasks in {module.__file__}") + logger.info(f"Found {len(module.TASKS_TABLE)} custom tasks in {module.__file__}") if len(TASKS_TABLE) > 0: custom_tasks_registry = create_lazy_tasks(meta_table=TASKS_TABLE, cache_dir=self._cache_dir) @@ -147,7 +147,7 @@ def task_registry(self): # Check the overlap between default_tasks_registry and custom_tasks_registry intersection = set(default_tasks_registry.keys()).intersection(set(custom_tasks_registry.keys())) if len(intersection) > 0: - hlog_warn( + logger.warning( f"Following tasks ({intersection}) exists both in the default and custom tasks. 
Will use the default ones on conflict." ) @@ -315,7 +315,9 @@ def taskinfo_selector(tasks: str, task_registry: Registry) -> tuple[list[str], d few_shot = int(few_shot) if suite_name not in DEFAULT_SUITES: - hlog(f"Suite {suite_name} unknown. This is not normal, unless you are testing adding new evaluations.") + logger.warning( + f"Suite {suite_name} unknown. This is not normal, unless you are testing adding new evaluations." + ) # This adds support for task supersets (eg: mmlu -> all the mmlu tasks) for expanded_task in task_registry.expand_task_definition(f"{suite_name}|{task_name}"): @@ -348,7 +350,7 @@ def create_lazy_tasks( # Every task is renamed suite|task, if the suite is in DEFAULT_SUITE for config in meta_table: if not any(suite in config.suite for suite in DEFAULT_SUITES): - hlog_warn( + logger.warning( f"This evaluation is not in any known suite: {config.name} is in {config.suite}, not in {DEFAULT_SUITES}. Skipping." ) continue diff --git a/src/lighteval/utils/parallelism.py b/src/lighteval/utils/parallelism.py index 892725d9..3308240c 100644 --- a/src/lighteval/utils/parallelism.py +++ b/src/lighteval/utils/parallelism.py @@ -23,10 +23,10 @@ import functools import gc import inspect +import logging import torch -from lighteval.logging.hierarchical_logger import hlog, logger from lighteval.utils.imports import ( NO_ACCELERATE_ERROR_MSG, NO_NANOTRON_ERROR_MSG, @@ -35,6 +35,9 @@ ) +logger = logging.getLogger(__name__) + + def should_reduce_batch_size(exception: Exception) -> bool: """ Checks if `exception` relates to CUDA out-of-memory, CUDNN not supported, or CPU out-of-memory @@ -127,10 +130,10 @@ def test_all_gather(accelerator=None, parallel_context=None): if accelerator: if not is_accelerate_available(): raise ImportError(NO_ACCELERATE_ERROR_MSG) - hlog("Test gather tensor") + logger.info("Test gather tensor") test_tensor: torch.Tensor = torch.tensor([accelerator.process_index], device=accelerator.device) gathered_tensor: torch.Tensor = accelerator.gather(test_tensor) - hlog(f"gathered_tensor {gathered_tensor}, should be {list(range(accelerator.num_processes))}") + logger.info(f"gathered_tensor {gathered_tensor}, should be {list(range(accelerator.num_processes))}") accelerator.wait_for_everyone() elif parallel_context: if not is_nanotron_available(): @@ -138,7 +141,7 @@ def test_all_gather(accelerator=None, parallel_context=None): from nanotron import distributed as dist from nanotron import logging - hlog("Test gather tensor") + logger.info("Test gather tensor") # Do a first NCCL sync to warmup and try to avoid Timeout after model/data loading logging.log_rank( f"[TEST] Running NCCL sync for ranks {list(range(parallel_context.world_pg.size()))}", @@ -162,4 +165,4 @@ def test_all_gather(accelerator=None, parallel_context=None): del test_tensor_list del test_tensor else: - hlog("Not running in a parallel setup, nothing to test") + logger.info("Not running in a parallel setup, nothing to test")
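
Note on the logging approach used throughout this patch: the hierarchical `hlog`/`hlog_warn`/`hlog_err` helpers are replaced everywhere by standard-library module loggers created with `logging.getLogger(__name__)`. The sketch below shows one way such module loggers can be wired to a single colorized handler via `colorlog`, which this patch adds to the dependencies. The helper name `setup_colored_logging` and the format string are illustrative assumptions for the example only; the actual configuration added elsewhere in this patch is not shown in this section.

```python
# Minimal sketch, assuming colorlog is installed; not the configuration code added in this PR.
import logging

import colorlog


def setup_colored_logging(level: int = logging.INFO) -> None:
    """Attach a single colorized stream handler to the 'lighteval' parent logger."""
    handler = colorlog.StreamHandler()
    handler.setFormatter(
        colorlog.ColoredFormatter("%(log_color)s[%(levelname)s] %(name)s: %(message)s")
    )
    parent = logging.getLogger("lighteval")
    parent.setLevel(level)
    parent.addHandler(handler)


# Every module logger created as `logger = logging.getLogger(__name__)` inside the
# lighteval package (e.g. lighteval.pipeline) propagates up to this handler.
setup_colored_logging()
logging.getLogger("lighteval.pipeline").info("--- LOADING MODEL ---")
```

Configuring only the `lighteval` parent logger leaves third-party libraries' log levels untouched, while every `lighteval.*` module logger inherits the handler through normal propagation.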