diff --git a/.github/workflows/modelconverter_test.yaml b/.github/workflows/modelconverter_test.yaml
index 50f9c73..e5bb82a 100644
--- a/.github/workflows/modelconverter_test.yaml
+++ b/.github/workflows/modelconverter_test.yaml
@@ -51,7 +51,7 @@ jobs:
           cache: pip
 
       - name: Install dependencies
-        run: pip install -e .[dev]
+        run: pip install -e .[dev] --extra-index-url https://artifacts.luxonis.com/artifactory/luxonis-python-release-local/
 
       - name: Authenticate to Google Cloud
         id: google-auth
diff --git a/.github/workflows/unittests.yaml b/.github/workflows/unittests.yaml
index 39fbfcb..a92ef85 100644
--- a/.github/workflows/unittests.yaml
+++ b/.github/workflows/unittests.yaml
@@ -25,7 +25,7 @@ jobs:
           cache: pip
 
      - name: Install package
-        run: python -m pip install -e .[dev]
+        run: python -m pip install -e .[dev] --extra-index-url https://artifacts.luxonis.com/artifactory/luxonis-python-release-local/
 
      - name: Run Unit Tests
        env:
diff --git a/README.md b/README.md
index b24ee18..5f7bd71 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,13 @@ pip install modelconv
 
 Run `modelconverter --help` to see the available commands and options.
 
+> \[!NOTE\]
+> To use the [benchmarking feature](#benchmarking), the `depthai v3` package must be installed. While `depthai v3` is not yet released on PyPI, you can install it with the following command:
+>
+> ```bash
+> pip install -r requirements-bench.txt --extra-index-url https://artifacts.luxonis.com/artifactory/luxonis-python-release-local/
+> ```
+
 ## Configuration
 
 There are two main ways to execute configure the conversion process:
@@ -437,3 +444,6 @@ modelconverter benchmark rvc3 --model-path
 
 The command prints a table with the benchmark results to the console and
 optionally saves the results to a `.csv` file.
+
+> \[!NOTE\]
+> For **RVC2** and **RVC4**: The `--model-path` can be a path to a local `.blob` file, an NN Archive file (`.tar.xz`), or a model slug from [Luxonis HubAI](https://hub.luxonis.com/ai). To access models from different teams in Luxonis HubAI, remember to update the `HUBAI_API_KEY` environment variable accordingly.
diff --git a/modelconverter/__main__.py b/modelconverter/__main__.py
index 5d73669..6c2a7ab 100644
--- a/modelconverter/__main__.py
+++ b/modelconverter/__main__.py
@@ -175,10 +175,12 @@ def benchmark(
 
     **RVC2**
 
-    - `--repetitions`: The number of repetitions to perform. Default: `1`
+    - `--repetitions`: The number of repetitions to perform. Default: `10`
 
     - `--num-threads`: The number of threads to use for inference. Default: `2`
 
+    - `--num-messages`: The number of messages to measure for each report. Default: `50`
+
     ---
 
     **RVC3**
@@ -191,8 +193,18 @@
     - `--profile`: The SNPE profile to use for inference. Default: `"default"`
 
+    - `--runtime`: The SNPE runtime to use for inference (dsp or cpu). Default: `"dsp"`
+
     - `--num-images`: The number of images to use for inference. Default: `1000`
 
+    - `--dai-benchmark`: Whether to run the benchmark using DepthAI v3. If `False`, the SNPE tools are used. Default: `True`
+
+    - `--repetitions`: The number of repetitions to perform (dai-benchmark only). Default: `10`
+
+    - `--num-threads`: The number of threads to use for inference (dai-benchmark only). Default: `1`
+
+    - `--num-messages`: The number of messages to measure for each report (dai-benchmark only). Default: `50`
+
     ---
     """
@@ -203,6 +215,13 @@
             key = key[2:].replace("-", "_")
         else:
             raise typer.BadParameter(f"Unknown argument: {key}")
+        if key == "dai_benchmark":
+            value = value.capitalize()
+            if value not in ["True", "False"]:
+                raise typer.BadParameter(
+                    "dai_benchmark must be either True or False"
+                )
+            value = value == "True"
         kwargs[key] = value
     Benchmark = get_benchmark(target)
     benchmark = Benchmark(str(model_path))
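
The `--dai-benchmark` value arrives through the generic key/value option parser as a plain string, which is why the hunk above capitalizes it and compares it against `"True"`/`"False"`. A minimal sketch of that coercion in isolation (the helper name is hypothetical; the surrounding CLI wiring is assumed):

```python
import typer


def coerce_dai_benchmark(value: str) -> bool:
    # Mirrors the parsing added above: accept "true"/"false" in any casing,
    # reject anything else with a CLI error.
    value = value.capitalize()
    if value not in ["True", "False"]:
        raise typer.BadParameter("dai_benchmark must be either True or False")
    return value == "True"


assert coerce_dai_benchmark("false") is False
assert coerce_dai_benchmark("TRUE") is True
```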
diff --git a/modelconverter/packages/base_benchmark.py b/modelconverter/packages/base_benchmark.py
index e3eba9a..0fb3d33 100644
--- a/modelconverter/packages/base_benchmark.py
+++ b/modelconverter/packages/base_benchmark.py
@@ -1,3 +1,4 @@
+import re
 from abc import ABC, abstractmethod
 from collections import namedtuple
 from logging import getLogger
@@ -7,7 +8,7 @@
 import pandas as pd
 from typing_extensions import TypeAlias
 
-from modelconverter.utils import resolve_path
+from modelconverter.utils import is_hubai_available, resolve_path
 
 logger = getLogger(__name__)
 
@@ -23,14 +24,36 @@ class Benchmark(ABC):
+    VALID_EXTENSIONS = (".tar.xz", ".blob", ".dlc")
+    HUB_MODEL_PATTERN = re.compile(r"^(?:([^/]+)/)?([^:]+):(.+)$")
+
     def __init__(
         self,
         model_path: str,
         dataset_path: Optional[Path] = None,
     ):
-        self.model_path = resolve_path(model_path, Path.cwd())
+        if any(model_path.endswith(ext) for ext in self.VALID_EXTENSIONS):
+            self.model_path = resolve_path(model_path, Path.cwd())
+            self.model_name = self.model_path.stem
+        else:
+            hub_match = self.HUB_MODEL_PATTERN.match(model_path)
+            if not hub_match:
+                raise ValueError(
+                    "Invalid 'model-path' format. Expected either:\n"
+                    "- Model file path: path/to/model.blob, path/to/model.dlc or path/to/model.tar.xz\n"
+                    "- HubAI model slug: [team_name/]model_name:variant"
+                )
+            team_name, model_name, model_variant = hub_match.groups()
+            if is_hubai_available(model_name, model_variant):
+                self.model_path = model_path
+                self.model_name = model_name
+            else:
+                raise ValueError(
+                    f"Model {team_name+'/' if team_name else ''}{model_name}:{model_variant} not found in HubAI."
+                )
+
         self.dataset_path = dataset_path
-        self.model_name = self.model_path.stem
+
         self.header = [
             *self.default_configuration.keys(),
             "fps",
@@ -64,7 +87,13 @@ def print_results(
             title=f"Benchmark Results for [yellow]{self.model_name}",
             box=box.ROUNDED,
         )
-        for field in self.header:
+
+        updated_header = [
+            *results[0][0].keys(),
+            "fps",
+            "latency (ms)",
+        ]
+        for field in updated_header:
             table.add_column(f"[cyan]{field}")
         for configuration, result in results:
             fps_color = (
@@ -74,17 +103,22 @@
                 if result.fps < 5
                 else "green"
             )
-            latency_color = (
-                "yellow"
-                if 50 < result.latency < 100
-                else "red"
-                if result.latency > 100
-                else "green"
-            )
+            if isinstance(result.latency, str):
+                latency_color = "orange3"
+            else:
+                latency_color = (
+                    "yellow"
+                    if 50 < result.latency < 100
+                    else "red"
+                    if result.latency > 100
+                    else "green"
+                )
             table.add_row(
                 *map(lambda x: f"[magenta]{x}", configuration.values()),
                 f"[{fps_color}]{result.fps:.2f}",
-                f"[{latency_color}]{result.latency:.5f}",
+                f"[{latency_color}]{result.latency}"
+                if isinstance(result.latency, str)
+                else f"[{latency_color}]{result.latency:.5f}",
             )
         console = Console()
         console.print(table)
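
For reference, the `HUB_MODEL_PATTERN` introduced above accepts an optional team prefix followed by `model_name:variant`. A small sketch of how it splits a slug (the slugs themselves are made-up examples):

```python
import re

HUB_MODEL_PATTERN = re.compile(r"^(?:([^/]+)/)?([^:]+):(.+)$")

# Hypothetical slugs, used only for illustration.
print(HUB_MODEL_PATTERN.match("yolov6-nano:coco-512x288").groups())
# (None, 'yolov6-nano', 'coco-512x288')
print(HUB_MODEL_PATTERN.match("my-team/yolov6-nano:coco-512x288").groups())
# ('my-team', 'yolov6-nano', 'coco-512x288')
print(HUB_MODEL_PATTERN.match("model.blob"))
# None -> handled by the file-extension branch instead
```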
""" - return {"repetitions": 1, "num_threads": 2} + return {"repetitions": 10, "num_messages": 50, "num_threads": 2} @property def all_configurations(self) -> List[Configuration]: return [ - {"repetitions": 5, "num_threads": 1}, - {"repetitions": 5, "num_threads": 2}, - {"repetitions": 5, "num_threads": 3}, + {"repetitions": 10, "num_messages": 50, "num_threads": 1}, + {"repetitions": 10, "num_messages": 50, "num_threads": 2}, + {"repetitions": 10, "num_messages": 50, "num_threads": 3}, ] def benchmark(self, configuration: Configuration) -> BenchmarkResult: @@ -35,135 +35,102 @@ def benchmark(self, configuration: Configuration) -> BenchmarkResult: @staticmethod def _benchmark( - model_path: Path, repetitions: int, num_threads: int + model_path: Path | str, + repetitions: int, + num_messages: int, + num_threads: int, ) -> BenchmarkResult: - model = dai.OpenVINO.Blob(model_path) - input_name_shape: Dict[str, List[int]] = {} - input_name_type = {} - for i in list(model.networkInputs): - input_name_shape[i] = model.networkInputs[i].dims - input_name_type[i] = model.networkInputs[i].dataType.name - - output_name_shape = {} - output_name_type = {} - for i in list(model.networkOutputs): - output_name_shape[i] = model.networkOutputs[i].dims - output_name_type[i] = model.networkOutputs[i].dataType.name - - pipeline = dai.Pipeline() - - detection_nn = pipeline.createNeuralNetwork() - detection_nn.setBlobPath(model_path) - detection_nn.setNumInferenceThreads(num_threads) - detection_nn.input.setBlocking(True) - detection_nn.input.setQueueSize(1) - - nn_in = pipeline.createXLinkIn() - nn_in.setMaxDataSize(6291456) - nn_in.setStreamName("in_nn") - nn_in.out.link(detection_nn.input) - - xout_nn = pipeline.createXLinkOut() - xout_nn.setStreamName("nn") - xout_nn.input.setQueueSize(1) - xout_nn.input.setBlocking(True) - detection_nn.out.link(xout_nn.input) - - xlink_buffer_max_size = 5 * 1024 * 1024 - product_sum = sum( - map(lambda x: np.product(np.array(x)), output_name_shape.values()) - ) - - xlink_buffer_count = int(xlink_buffer_max_size / product_sum) - - logger.info(f"XLink buffer count: {xlink_buffer_count}") - if xlink_buffer_count > 1000: - logger.warning( - "XLink buffer count is too high! " - "The benchmarking will take more time and " - "the results may be overestimated." + device = dai.Device() + if device.getPlatform() != dai.Platform.RVC2: + raise ValueError( + f"Found {device.getPlatformAsString()}, expected RVC2 platform." + ) + + if isinstance(model_path, str): + modelPath = dai.getModelFromZoo( + dai.NNModelDescription( + model_path, + platform=device.getPlatformAsString(), + ), + apiKey=environ.HUBAI_API_KEY if environ.HUBAI_API_KEY else "", + ) + elif str(model_path).endswith(".tar.xz"): + modelPath = str(model_path) + elif str(model_path).endswith(".blob"): + modelPath = model_path + else: + raise ValueError( + "Unsupported model format. Supported formats: .tar.xz, .blob, or HubAI model slug." 
) - with dai.Device(pipeline) as device, Progress() as progress: - device = cast(dai.Device, device) - detection_in_count = 100 + xlink_buffer_count - detection_in = device.getInputQueue( - "in_nn", maxSize=detection_in_count, blocking=True + inputSizes = [] + inputNames = [] + if isinstance(model_path, str) or str(model_path).endswith(".tar.xz"): + modelArhive = dai.NNArchive(modelPath) + for input in modelArhive.getConfig().model.inputs: + inputSizes.append(input.shape[::-1]) + inputNames.append(input.name) + elif str(model_path).endswith(".blob"): + blob_model = dai.OpenVINO.Blob(modelPath) + for input in blob_model.networkInputs: + inputSizes.append(blob_model.networkInputs[input].dims) + inputNames.append(input) + + inputData = dai.NNData() + for name, inputSize in zip(inputNames, inputSizes): + img = np.random.randint( + 0, 255, (inputSize[1], inputSize[0], 3), np.uint8 ) - q_nn = device.getOutputQueue(name="nn", maxSize=1, blocking=True) + inputData.addTensor(name, img) - fps_storage = [] - diffs = [] - time.sleep(1) + with dai.Pipeline(device) as pipeline, Progress() as progress: repet_task = progress.add_task( "[magenta]Repetition", total=repetitions ) - infer_task = progress.add_task( - "[magenta]Inference", total=300 + 2 * xlink_buffer_count - ) - for _ in range(repetitions): - progress.reset(infer_task, total=300 + 2 * xlink_buffer_count) - for _ in range(100 + xlink_buffer_count): - nn_data = dai.NNData() - for inp_name in input_name_shape: - if input_name_type[inp_name] in ["FLOAT16", "FLOAT32"]: - frame = cast( - np.ndarray, - np.random.rand(*input_name_shape[inp_name]), - ) - frame = frame.astype( - "float16" - if input_name_type[inp_name] == "FLOAT16" - else "float32" - ) - elif input_name_type[inp_name] in ["INT", "I8", "U8F"]: - frame = np.random.randint( - 256, - size=input_name_shape[inp_name], - dtype=( - np.int32 - if input_name_type[inp_name] == "INT" - else ( - np.uint8 - if input_name_type[inp_name] == "U8F" - else np.int8 - ) - ), - ) - else: - raise RuntimeError( - f"Unknown input type detected: {input_name_type[inp_name]}!" - ) - - nn_data.setLayer(inp_name, frame) - - if len(input_name_shape) == 0: - raise RuntimeError( - "Failed to create input data: missing required information for one or more input layers." 
- ) - detection_in.send(nn_data) - progress.update(infer_task, advance=1) - - for _ in range(100): - progress.update(infer_task, advance=1) - time.sleep(3 / 100) - - for _ in range(40 + xlink_buffer_count): - cast(NNData, q_nn.get()).getFirstLayerFp16() - progress.update(infer_task, advance=1) - - start = time.time() - for _ in range(50): - cast(NNData, q_nn.get()).getFirstLayerFp16() - progress.update(infer_task, advance=1) - diff = time.time() - start - diffs.append(diff / 50) - fps_storage.append(50 / diff) - - for _ in range(10): - cast(NNData, q_nn.get()).getFirstLayerFp16() - progress.update(infer_task, advance=1) + + benchmarkOut = pipeline.create(dai.node.BenchmarkOut) + benchmarkOut.setRunOnHost(False) + benchmarkOut.setFps(-1) + + neuralNetwork = pipeline.create(dai.node.NeuralNetwork) + if isinstance(model_path, str) or str(model_path).endswith( + ".tar.xz" + ): + neuralNetwork.setNNArchive(modelArhive) + elif str(model_path).endswith(".blob"): + neuralNetwork.setBlobPath(modelPath) + neuralNetwork.setNumInferenceThreads(num_threads) + + benchmarkIn = pipeline.create(dai.node.BenchmarkIn) + benchmarkIn.setRunOnHost(False) + benchmarkIn.sendReportEveryNMessages(num_messages) + benchmarkIn.logReportsAsWarnings(False) + + benchmarkOut.out.link(neuralNetwork.input) + neuralNetwork.out.link(benchmarkIn.input) + + outputQueue = benchmarkIn.report.createOutputQueue() + inputQueue = benchmarkOut.input.createInputQueue() + + pipeline.start() + inputQueue.send(inputData) + + rep = 0 + fps_list = [] + avg_latency_list = [] + while pipeline.isRunning() and rep < repetitions: + benchmarkReport = outputQueue.get() + if not isinstance(benchmarkReport, dai.BenchmarkReport): + raise ValueError( + f"Expected BenchmarkReport, got {type(benchmarkReport)}" + ) + fps = benchmarkReport.fps + avg_latency = benchmarkReport.averageLatency * 1000 + + fps_list.append(fps) + avg_latency_list.append(avg_latency) progress.update(repet_task, advance=1) + rep += 1 - diffs = np.array(diffs) * 1000 - return BenchmarkResult(np.mean(fps_storage), np.mean(diffs)) + # Currently, the latency measurement is not supported on RVC2 by the depthai library. + return BenchmarkResult(np.mean(fps_list), "N/A") diff --git a/modelconverter/packages/rvc4/benchmark.py b/modelconverter/packages/rvc4/benchmark.py index 4e71bd0..b449766 100644 --- a/modelconverter/packages/rvc4/benchmark.py +++ b/modelconverter/packages/rvc4/benchmark.py @@ -1,15 +1,19 @@ import io +import json import logging import re +import shutil import subprocess import tempfile from pathlib import Path from typing import Dict, Final, List, Optional, Tuple, cast +import depthai as dai import numpy as np import pandas as pd +from rich.progress import Progress -from modelconverter.utils import subprocess_run +from modelconverter.utils import environ, subprocess_run from ..base_benchmark import Benchmark, BenchmarkResult, Configuration @@ -29,6 +33,11 @@ "system_settings", ] +RUNTIMES: Dict[str, str] = { + "dsp": "use_dsp", + "cpu": "use_cpu", +} + class AdbHandler: def __init__(self, device_id: Optional[str] = None) -> None: @@ -71,29 +80,49 @@ def push(self, src: str, dst: str) -> Tuple[int, str, str]: class RVC4Benchmark(Benchmark): adb = AdbHandler() + force_cpu: bool = False @property def default_configuration(self) -> Configuration: """ profile: The SNPE profile to use for inference. + runtime: The SNPE runtime to use for inference. num_images: The number of images to use for inference. + dai_benchmark: Whether to use the DepthAI for benchmarking. 
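
In the new flow above (and in the RVC4 counterpart below), `BenchmarkIn` emits one `BenchmarkReport` per `num_messages` processed messages, and the loop collects `repetitions` such reports before averaging, so a run covers roughly `num_messages * repetitions` inferences. A rough sketch of that aggregation under those assumptions (the sample values are made up):

```python
import numpy as np

num_messages, repetitions = 50, 10
reports_fps = [110.4, 112.1, 109.8]  # one value per BenchmarkReport, for illustration

total_inferences = num_messages * repetitions  # ~500 inferences per run
mean_fps = float(np.mean(reports_fps))          # what ends up in BenchmarkResult.fps
# averageLatency is reported in seconds and converted to ms above, but on RVC2
# it is not populated, hence the table shows latency as "N/A".
```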
diff --git a/modelconverter/packages/rvc4/benchmark.py b/modelconverter/packages/rvc4/benchmark.py
index 4e71bd0..b449766 100644
--- a/modelconverter/packages/rvc4/benchmark.py
+++ b/modelconverter/packages/rvc4/benchmark.py
@@ -1,15 +1,19 @@
 import io
+import json
 import logging
 import re
+import shutil
 import subprocess
 import tempfile
 from pathlib import Path
 from typing import Dict, Final, List, Optional, Tuple, cast
 
+import depthai as dai
 import numpy as np
 import pandas as pd
+from rich.progress import Progress
 
-from modelconverter.utils import subprocess_run
+from modelconverter.utils import environ, subprocess_run
 
 from ..base_benchmark import Benchmark, BenchmarkResult, Configuration
@@ -29,6 +33,11 @@
     "system_settings",
 ]
 
+RUNTIMES: Dict[str, str] = {
+    "dsp": "use_dsp",
+    "cpu": "use_cpu",
+}
+
 
 class AdbHandler:
     def __init__(self, device_id: Optional[str] = None) -> None:
@@ -71,29 +80,49 @@ def push(self, src: str, dst: str) -> Tuple[int, str, str]:
 
 class RVC4Benchmark(Benchmark):
     adb = AdbHandler()
+    force_cpu: bool = False
 
     @property
     def default_configuration(self) -> Configuration:
         """
         profile: The SNPE profile to use for inference.
+        runtime: The SNPE runtime to use for inference.
         num_images: The number of images to use for inference.
+        dai_benchmark: Whether to use DepthAI for benchmarking.
+        repetitions: The number of repetitions to perform (dai-benchmark only).
+        num_threads: The number of threads to use for inference (dai-benchmark only).
+        num_messages: The number of messages to measure for each report (dai-benchmark only).
         """
-        return {"profile": "default", "num_images": 1000}
+        return {
+            "profile": "default",
+            "runtime": "dsp",
+            "num_images": 1000,
+            "dai_benchmark": True,
+            "repetitions": 10,
+            "num_threads": 1,
+            "num_messages": 50,
+        }
 
     @property
     def all_configurations(self) -> List[Configuration]:
         return [{"profile": profile} for profile in PROFILES]
 
-    def _get_input_sizes(self) -> Dict[str, List[int]]:
+    def _get_input_sizes(self) -> Tuple[Dict[str, List[int]], Dict[str, str]]:
         csv_path = Path("info.csv")
         subprocess_run(
-            ["snpe-dlc-info", "-i", self.model_path, "-s", csv_path]
+            [
+                "snpe-dlc-info",
+                "-i",
+                self.model_path,
+                "-s",
+                csv_path,
+            ]
         )
         content = csv_path.read_text()
         csv_path.unlink()
 
         start_marker = "Input Name,Dimensions,Type,Encoding Info"
-        end_marker = "Total parameters:"
+        end_marker = "Output Name,Dimensions,Type,Encoding Info"
         start_index = content.find(start_marker)
         end_index = content.find(end_marker, start_index)
@@ -106,18 +135,34 @@ def _get_input_sizes(self) -> Dict[str, List[int]]:
             )
             for _, row in df.iterrows()
         }
-        return sizes
+        data_types = {
+            str(row["Input Name"]): str(row["Type"])
+            for _, row in df.iterrows()
+        }
+
+        return sizes, data_types
 
     def _prepare_raw_inputs(self, num_images: int) -> None:
-        input_sizes = self._get_input_sizes()
+        input_sizes, data_types = self._get_input_sizes()
         input_list = ""
 
         self.adb.shell(f"mkdir /data/local/tmp/{self.model_name}/inputs")
         for i in range(num_images):
             for name, size in input_sizes.items():
+                if data_types[name] == "Float_32":
+                    self.force_cpu = True
+                    numpy_type = np.float32
+                elif data_types[name] == "Float_16":
+                    numpy_type = np.float16
+                elif data_types[name] == "uFxp_8":
+                    numpy_type = np.uint8
+                else:
+                    raise ValueError(
+                        f"Unsupported data type {data_types[name]} for input {name}."
+                    )
                 img = cast(np.ndarray, np.random.rand(*size)).astype(
-                    np.float32
+                    numpy_type
                 )
-                with tempfile.TemporaryFile() as f:
+                with tempfile.NamedTemporaryFile() as f:
                     img.tofile(f)
                     self.adb.push(
                         f.name,
@@ -125,39 +170,162 @@ def _prepare_raw_inputs(self, num_images: int) -> None:
                     )
                 input_list += f"{name}:=/data/local/tmp/{self.model_name}/inputs/{name}_{i}.raw "
-        with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
+            input_list += "\n"
+
+        temp_path = tempfile.mktemp()
+        with open(temp_path, "w") as f:
             f.write(input_list)
+            f.flush()
+
+        try:
             self.adb.push(
-                f.name, f"/data/local/tmp/{self.model_name}/input_list.txt"
+                temp_path, f"/data/local/tmp/{self.model_name}/input_list.txt"
             )
+        finally:
+            Path(temp_path).unlink()
+
+    def _get_data_type(self) -> dai.TensorInfo.DataType:
+        """Retrieve the data type of the model inputs. If the model is not a HubAI
+        model, it defaults to dai.TensorInfo.DataType.U8F (INT8).
+
+        @return: The data type of the model inputs.
+        @rtype: dai.TensorInfo.DataType
+        """
+        from modelconverter.cli import Request, slug_to_id
+
+        if not isinstance(
+            self.model_path, str
+        ) or not self.HUB_MODEL_PATTERN.match(self.model_path):
+            return dai.TensorInfo.DataType.U8F
+
+        model_id = slug_to_id(self.model_name, "models")
+        model_variant = self.model_path.split(":")[1]
+
+        model_variants = []
+        for is_public in [True, False]:
+            try:
+                model_variants += Request.get(
+                    "modelVersions/",
+                    params={"model_id": model_id, "is_public": is_public},
+                )
+            except Exception:
+                continue
+
+        model_version_id = None
+        for version in model_variants:
+            if version["variant_slug"] == model_variant:
+                model_version_id = version["id"]
+                break
+
+        if not model_version_id:
+            return dai.TensorInfo.DataType.U8F
+
+        model_instances = []
+        for is_public in [True, False]:
+            try:
+                model_instances += Request.get(
+                    "modelInstances/",
+                    params={
+                        "model_id": model_id,
+                        "model_version_id": model_version_id,
+                        "is_public": is_public,
+                    },
+                )
+            except Exception:
+                continue
+
+        model_precision_type = "INT8"
+        for instance in model_instances:
+            if instance["platforms"] == ["RVC4"]:
+                model_precision_type = instance.get(
+                    "model_precision_type", "INT8"
+                )
+                break
+
+        if model_precision_type == "FP16":
+            return dai.TensorInfo.DataType.FP16
+        elif model_precision_type == "FP32":
+            self.force_cpu = True
+            return dai.TensorInfo.DataType.FP32
+
+        return dai.TensorInfo.DataType.U8F
 
     def benchmark(self, configuration: Configuration) -> BenchmarkResult:
+        dai_benchmark = configuration.get("dai_benchmark")
         try:
-            return self._benchmark(self.model_path, **configuration)
+            if dai_benchmark:
+                for key in ["dai_benchmark", "num_images"]:
+                    configuration.pop(key)
+                return self._benchmark_dai(self.model_path, **configuration)
+            else:
+                for key in [
+                    "dai_benchmark",
+                    "repetitions",
+                    "num_threads",
+                    "num_messages",
+                ]:
+                    configuration.pop(key)
+                return self._benchmark_snpe(self.model_path, **configuration)
         finally:
-            # so we don't delete the wrong directory
-            assert self.model_name
+            if not dai_benchmark:
+                # so we don't delete the wrong directory
+                assert self.model_name
 
-            self.adb.shell(f"rm -rf /data/local/tmp/{self.model_name}")
+                self.adb.shell(f"rm -rf /data/local/tmp/{self.model_name}")
 
-    def _benchmark(
-        self, model_path: Path, num_images: int, profile: str
+    def _benchmark_snpe(
+        self,
+        model_path: Path | str,
+        num_images: int,
+        profile: str,
+        runtime: str,
     ) -> BenchmarkResult:
+        runtime = RUNTIMES[runtime] if runtime in RUNTIMES else "use_dsp"
+
+        if isinstance(model_path, str):
+            model_archive = dai.getModelFromZoo(
+                dai.NNModelDescription(
+                    model_path,
+                    platform=dai.Platform.RVC4.name,
+                ),
+                apiKey=environ.HUBAI_API_KEY if environ.HUBAI_API_KEY else "",
+            )
+            tmp_dir = Path(model_archive).parent / "tmp"
+            shutil.unpack_archive(model_archive, tmp_dir)
+
+            dlc_model_name = json.loads((tmp_dir / "config.json").read_text())[
+                "model"
+            ]["metadata"]["path"]
+            dlc_path = next(tmp_dir.rglob(dlc_model_name), None)
+            if not dlc_path:
+                raise ValueError("Could not find model.dlc in the archive.")
+            self.model_path = dlc_path
+        elif str(model_path).endswith(".dlc"):
+            dlc_path = model_path
+        else:
+            raise ValueError(
+                "Unsupported model format. Supported formats: .dlc, or HubAI model slug."
+            )
+
         self.adb.shell(f"mkdir /data/local/tmp/{self.model_name}")
         self.adb.push(
-            str(model_path), f"/data/local/tmp/{self.model_name}/model.dlc"
+            str(dlc_path), f"/data/local/tmp/{self.model_name}/model.dlc"
         )
         self._prepare_raw_inputs(num_images)
 
+        if self.force_cpu:
+            logger.warning(
+                "Forcing CPU runtime due to Float_32 input data type."
+            )
+            runtime = "use_cpu"
+
         _, stdout, _ = self.adb.shell(
-            "source /data/local/tmp/source_me.sh && "
+            # "source /data/local/tmp/source_me.sh && "
             "snpe-parallel-run "
             f"--container /data/local/tmp/{self.model_name}/model.dlc "
             f"--input_list /data/local/tmp/{self.model_name}/input_list.txt "
             f"--output_dir /data/local/tmp/{self.model_name}/outputs "
             f"--perf_profile {profile} "
-            "--cpu_fallback false "
-            "--use_dsp"
+            "--cpu_fallback true "
+            f"--{runtime}"
         )
         pattern = re.compile(r"(\d+\.\d+) infs/sec")
         match = pattern.search(stdout)
@@ -167,4 +335,120 @@
                 f"stdout:\n{stdout}"
             )
         fps = float(match.group(1))
-        return BenchmarkResult(fps=fps, latency=0)
+        return BenchmarkResult(fps=fps, latency="N/A")
+
+    def _benchmark_dai(
+        self,
+        model_path: Path | str,
+        profile: str,
+        runtime: str,
+        repetitions: int,
+        num_threads: int,
+        num_messages: int,
+    ) -> BenchmarkResult:
+        device = dai.Device()
+
+        if device.getPlatform() != dai.Platform.RVC4:
+            raise ValueError(
+                f"Found {device.getPlatformAsString()}, expected RVC4 platform."
+            )
+
+        if isinstance(model_path, str):
+            modelPath = dai.getModelFromZoo(
+                dai.NNModelDescription(
+                    model_path,
+                    platform=device.getPlatformAsString(),
+                ),
+                apiKey=environ.HUBAI_API_KEY if environ.HUBAI_API_KEY else "",
+            )
+        elif str(model_path).endswith(".tar.xz"):
+            modelPath = str(model_path)
+        elif str(model_path).endswith(".dlc"):
+            raise ValueError(
+                "DLC model format is not currently supported for dai-benchmark. Please use SNPE for DLC models."
+            )
+        else:
+            raise ValueError(
+                "Unsupported model format. Supported formats: .tar.xz, or HubAI model slug."
+            )
+
+        inputSizes = []
+        inputNames = []
+        if isinstance(model_path, str) or str(model_path).endswith(".tar.xz"):
+            modelArchive = dai.NNArchive(modelPath)
+            for input in modelArchive.getConfig().model.inputs:
+                inputSizes.append(input.shape)
+                inputNames.append(input.name)
+
+        data_type = self._get_data_type()
+        inputData = dai.NNData()
+        for name, inputSize in zip(inputNames, inputSizes):
+            img = np.random.randint(0, 255, inputSize, np.uint8)
+            inputData.addTensor(name, img, dataType=data_type)
+
+        with dai.Pipeline(device) as pipeline, Progress() as progress:
+            repet_task = progress.add_task(
+                "[magenta]Repetition", total=repetitions
+            )
+
+            benchmarkOut = pipeline.create(dai.node.BenchmarkOut)
+            benchmarkOut.setRunOnHost(False)
+            benchmarkOut.setFps(-1)
+
+            neuralNetwork = pipeline.create(dai.node.NeuralNetwork)
+            if isinstance(model_path, str) or str(model_path).endswith(
+                ".tar.xz"
+            ):
+                neuralNetwork.setNNArchive(modelArchive)
+
+            if self.force_cpu:
+                logger.warning(
+                    "Forcing CPU runtime due to Float_32 input data type."
+                )
+                runtime = "cpu"
+            neuralNetwork.setBackendProperties(
+                {
+                    "runtime": runtime,
+                    "performance_profile": profile,
+                }
+            )
+            if num_threads > 1:
+                logger.warning(
+                    "num_threads > 1 is not supported for RVC4. Setting num_threads to 1."
+                )
+                num_threads = 1
+            neuralNetwork.setNumInferenceThreads(num_threads)
+
+            benchmarkIn = pipeline.create(dai.node.BenchmarkIn)
+            benchmarkIn.setRunOnHost(False)
+            benchmarkIn.sendReportEveryNMessages(num_messages)
+            benchmarkIn.logReportsAsWarnings(False)
+
+            benchmarkOut.out.link(neuralNetwork.input)
+            neuralNetwork.out.link(benchmarkIn.input)
+
+            outputQueue = benchmarkIn.report.createOutputQueue()
+            inputQueue = benchmarkOut.input.createInputQueue()
+
+            pipeline.start()
+            inputQueue.send(inputData)
+
+            rep = 0
+            fps_list = []
+            avg_latency_list = []
+            while pipeline.isRunning() and rep < repetitions:
+                benchmarkReport = outputQueue.get()
+                if not isinstance(benchmarkReport, dai.BenchmarkReport):
+                    raise ValueError(
+                        f"Expected BenchmarkReport, got {type(benchmarkReport)}"
+                    )
+                fps = benchmarkReport.fps
+                avg_latency = benchmarkReport.averageLatency * 1000
+
+                fps_list.append(fps)
+                avg_latency_list.append(avg_latency)
+                progress.update(repet_task, advance=1)
+                rep += 1
+
+        # Currently, latency measurement is only supported on RVC4 when using ImgFrame as the input to BenchmarkOut, which we don't do here.
+        return BenchmarkResult(np.mean(fps_list), "N/A")
diff --git a/modelconverter/utils/__init__.py b/modelconverter/utils/__init__.py
index 9566125..7d7bf33 100644
--- a/modelconverter/utils/__init__.py
+++ b/modelconverter/utils/__init__.py
@@ -19,6 +19,7 @@
     resolve_path,
     upload_file_to_remote,
 )
+from .hubai_utils import is_hubai_available
 from .image import read_calib_dir, read_image
 from .layout import guess_new_layout, make_default_layout
 from .metadata import Metadata, get_metadata
@@ -45,6 +46,7 @@
     "subprocess_run",
     "download_from_remote",
     "upload_file_to_remote",
+    "is_hubai_available",
     "get_protocol",
     "process_nn_archive",
     "modelconverter_config_to_nn",
diff --git a/modelconverter/utils/hubai_utils.py b/modelconverter/utils/hubai_utils.py
new file mode 100644
index 0000000..1229179
--- /dev/null
+++ b/modelconverter/utils/hubai_utils.py
@@ -0,0 +1,25 @@
+def is_hubai_available(model_name: str, model_variant: str) -> bool:
+    from modelconverter.cli import Request, slug_to_id
+
+    model_slug = f"{model_name}:{model_variant}"
+
+    model_id = slug_to_id(
+        model_name,
+        "models",
+    )
+
+    model_variants = []
+    for is_public in [True, False]:
+        try:
+            model_variants += Request.get(
+                "modelVersions/",
+                params={"model_id": model_id, "is_public": is_public},
+            )
+        except Exception:
+            pass
+
+    for version in model_variants:
+        if f"{model_name}:{version['variant_slug']}" == model_slug:
+            return True
+
+    return False
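
A quick usage sketch of the new helper (the model name and variant are hypothetical examples; the call issues HubAI API requests, so `HUBAI_API_KEY` may need to be set for non-public teams):

```python
from modelconverter.utils import is_hubai_available

# Hypothetical slug components, used only for illustration.
if is_hubai_available("yolov6-nano", "coco-512x288"):
    print("Slug resolvable on HubAI; safe to pass as --model-path.")
else:
    print("Not found on HubAI; fall back to a local .blob/.dlc/.tar.xz path.")
```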
diff --git a/requirements-bench.txt b/requirements-bench.txt
index a2d1086..9f3e337 100644
--- a/requirements-bench.txt
+++ b/requirements-bench.txt
@@ -1,2 +1,2 @@
-depthai
+depthai>=3.0.0a12
 pandas
diff --git a/requirements.txt b/requirements.txt
index cb9cb9c..35ca1da 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,4 +13,4 @@ keyring
 onnx_graphsurgeon
 onnxoptimizer
 wget
-aiobotocore<2.18 # to be removed after luxonis-ml>=0.6.0
\ No newline at end of file
+aiobotocore<2.18 # to be removed after luxonis-ml>=0.6.0
diff --git a/shared_with_container/configs/defaults.yaml b/shared_with_container/configs/defaults.yaml
index 095bfbe..395b175 100644
--- a/shared_with_container/configs/defaults.yaml
+++ b/shared_with_container/configs/defaults.yaml
@@ -85,6 +85,9 @@ stages:
     # Do not run ONNX simplifier on the provided model.
     disable_onnx_simplification: false
 
+    # Do not run ONNX graph optimisations on the provided model.
+    disable_onnx_optimisation: false
+
     # List of input names with shapes,
     # data types, values for freezing and input modifiers.
     # Overrides the top-level input modifiers.