Update RVC2 and RVC4 benchmark scripts to work with the dai Benchmark Nodes #64

Open: wants to merge 27 commits into base: main

Commits (the file diff shown further below covers 4 of the 27 commits)
c424740
Update benchmark script for RVC2 using daiv3
ptoupas Jan 8, 2025
6cb5a2c
Add dai based benchmark execution for RVC4 device
ptoupas Jan 9, 2025
c4b1a5d
Ignore latency measurements on dai based benchmark reports
ptoupas Jan 9, 2025
e97a453
Update is_hubai_available to work with hubAI API calls
ptoupas Jan 10, 2025
7295d96
Update is_hubai_available to work with various teams from HubAI
ptoupas Jan 10, 2025
82a7044
Remove removeprefix to work with python version 3.8 [skip ci]
ptoupas Jan 10, 2025
a34b9ed
Fix test_modifier test error with EfficientVIT model and change the A…
ptoupas Jan 10, 2025
44a097b
Update .pre-commit-config.yaml
ptoupas Jan 10, 2025
7d4d223
Fix model path and HubAI model slug parsing [ci skip]
ptoupas Jan 13, 2025
57b8982
Add HUBAI_API_KEY to getModelFromZoo calls [ci skip]
ptoupas Jan 13, 2025
d6e5da1
Update Benchmarking Section of README file [ci skip]
ptoupas Jan 13, 2025
4d3bc5b
Update .pre-commit-config.yaml [ci skip]
ptoupas Jan 13, 2025
e8bc974
Fix dlc parsing on Benchmark __init__
ptoupas Jan 14, 2025
e2a7ed7
Update the way modify_onnx optimisation runs are conducted in the ONN…
ptoupas Jan 14, 2025
cd2b088
Fix SNPE benchmark on RVC4 and added support for benchmark over model…
ptoupas Jan 14, 2025
addc5f1
Updated ONNX version (#56)
kozlov721 Jan 15, 2025
f0149cd
Update the RVC4 benchmark to take into account the data type for each…
ptoupas Jan 16, 2025
2753987
Merge remote-tracking branch 'origin' into fix/update-benchmarks-scri…
ptoupas Jan 16, 2025
8dfdb84
Update .pre-commit-config.yaml [ci skip]
ptoupas Jan 16, 2025
b58782c
Fix issue when extracting the model from NNArchive in snpe benchmark …
ptoupas Jan 27, 2025
9b2a602
Add bool tensor type during evaluation of onnx models on ONNXModifier…
ptoupas Jan 27, 2025
e081181
Add a try except block on onnx optimisation and validation.
ptoupas Jan 27, 2025
9cd7158
Merge remote-tracking branch 'origin' into fix/update-benchmarks-scri…
ptoupas Jan 28, 2025
565ae6e
add disable_onnx_optimisation flag on the example defaults.yaml file
ptoupas Jan 28, 2025
d37ec5e
Update dai requirement to version 3.0.0a12 [ci skip]
ptoupas Jan 29, 2025
0548541
Add botocore requirement
ptoupas Jan 29, 2025
c2f91f2
Remove the extra-index-url from the requirements-bench.txt file
ptoupas Jan 29, 2025
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
@@ -1,3 +1,5 @@
default_language_version:
  python: python3
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.1.2
21 changes: 20 additions & 1 deletion modelconverter/__main__.py
@@ -175,10 +175,12 @@ def benchmark(

**RVC2**

- `--repetitions`: The number of repetitions to perform. Default: `1`
- `--repetitions`: The number of repetitions to perform. Default: `10`

- `--num-threads`: The number of threads to use for inference. Default: `2`

- `--num-messages`: The number of messages to measure for each report. Default: `50`

---

**RVC3**
@@ -191,8 +193,18 @@ def benchmark(

- `--profile`: The SNPE profile to use for inference. Default: `"default"`

- `--runtime`: The SNPE runtime to use for inference (dsp or cpu). Default: `"dsp"`

- `--num-images`: The number of images to use for inference. Default: `1000`

- `--dai-benchmark`: Whether to run the benchmark through the DepthAI v3 benchmark nodes. If `False`, the SNPE tools are used instead. Default: `True`

- `--repetitions`: The number of repetitions to perform (dai-benchmark only). Default: `10`

- `--num-threads`: The number of threads to use for inference (dai-benchmark only). Default: `1`

- `--num-messages`: The number of messages to measure for each report (dai-benchmark only). Default: `50`

---
"""

@@ -203,6 +215,13 @@ def benchmark(
key = key[2:].replace("-", "_")
else:
raise typer.BadParameter(f"Unknown argument: {key}")
if key == "dai_benchmark":
value = value.capitalize()
if value not in ["True", "False"]:
raise typer.BadParameter(
"dai_benchmark must be either True or False"
)
value = value == "True"
kwargs[key] = value
Benchmark = get_benchmark(target)
benchmark = Benchmark(str(model_path))
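The coercion added in the hunk above accepts any capitalisation of "true"/"false" and rejects everything else. A self-contained sketch follows; the helper name `parse_dai_benchmark` is introduced here only for illustration, and `ValueError` stands in for `typer.BadParameter`.

```python
def parse_dai_benchmark(value: str) -> bool:
    """Sketch of the --dai-benchmark coercion; illustrative helper only."""
    value = value.capitalize()  # "true" -> "True", "FALSE" -> "False"
    if value not in ["True", "False"]:
        raise ValueError("dai_benchmark must be either True or False")
    return value == "True"

assert parse_dai_benchmark("true") is True
assert parse_dai_benchmark("FALSE") is False
```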
40 changes: 28 additions & 12 deletions modelconverter/packages/base_benchmark.py
Expand Up @@ -7,7 +7,7 @@
import pandas as pd
from typing_extensions import TypeAlias

from modelconverter.utils import resolve_path
from modelconverter.utils import is_hubai_available, resolve_path

logger = getLogger(__name__)

@@ -28,9 +28,14 @@ def __init__(
model_path: str,
dataset_path: Optional[Path] = None,
):
self.model_path = resolve_path(model_path, Path.cwd())
if not is_hubai_available(model_path):
self.model_path = resolve_path(model_path, Path.cwd())
self.model_name = self.model_path.stem
else:
self.model_path = model_path
self.model_name = self.model_path.split("/", 1)[-1]
self.dataset_path = dataset_path
self.model_name = self.model_path.stem

self.header = [
*self.default_configuration.keys(),
"fps",
@@ -64,7 +69,13 @@ def print_results(
title=f"Benchmark Results for [yellow]{self.model_name}",
box=box.ROUNDED,
)
for field in self.header:

updated_header = [
*results[0][0].keys(),
"fps",
"latency (ms)",
]
for field in updated_header:
table.add_column(f"[cyan]{field}")
for configuration, result in results:
fps_color = (
@@ -74,17 +85,22 @@
if result.fps < 5
else "green"
)
latency_color = (
"yellow"
if 50 < result.latency < 100
else "red"
if result.latency > 100
else "green"
)
if isinstance(result.latency, str):
latency_color = "orange3"
else:
latency_color = (
"yellow"
if 50 < result.latency < 100
else "red"
if result.latency > 100
else "green"
)
table.add_row(
*map(lambda x: f"[magenta]{x}", configuration.values()),
f"[{fps_color}]{result.fps:.2f}",
f"[{latency_color}]{result.latency:.5f}",
f"[{latency_color}]{result.latency}"
if isinstance(result.latency, str)
else f"[{latency_color}]{result.latency:.5f}",
)
console = Console()
console.print(table)
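To make the new branching in `Benchmark.__init__` concrete, here is a small sketch of how `model_name` is derived in the two cases; the example slug and file name are made up, and `hubai_available` stands in for the `is_hubai_available` check.

```python
from pathlib import Path

def derive_model_name(model_path: str, hubai_available: bool) -> str:
    """Sketch of the naming logic in Benchmark.__init__; values are illustrative."""
    if hubai_available:
        # HubAI slug: drop the team prefix and keep the rest,
        # e.g. "luxonis/yolov6-nano:r2-coco-512x288" -> "yolov6-nano:r2-coco-512x288"
        return model_path.split("/", 1)[-1]
    # Local file: fall back to the path's stem,
    # e.g. "models/yolov6n.blob" -> "yolov6n"
    return Path(model_path).stem
```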
222 changes: 93 additions & 129 deletions modelconverter/packages/rvc2/benchmark.py
@@ -1,11 +1,9 @@
import logging
import time
from pathlib import Path
from typing import Dict, List, cast
from typing import List

import depthai as dai
import numpy as np
from depthai import NNData
from rich.progress import Progress

from ..base_benchmark import Benchmark, BenchmarkResult, Configuration
@@ -20,150 +18,116 @@ def default_configuration(self) -> Configuration:
repetitions: The number of repetitions to perform.
num_threads: The number of threads to use for inference.
"""
return {"repetitions": 1, "num_threads": 2}
return {"repetitions": 10, "num_messages": 50, "num_threads": 2}

@property
def all_configurations(self) -> List[Configuration]:
return [
{"repetitions": 5, "num_threads": 1},
{"repetitions": 5, "num_threads": 2},
{"repetitions": 5, "num_threads": 3},
{"repetitions": 10, "num_messages": 50, "num_threads": 1},
{"repetitions": 10, "num_messages": 50, "num_threads": 2},
{"repetitions": 10, "num_messages": 50, "num_threads": 3},
]

def benchmark(self, configuration: Configuration) -> BenchmarkResult:
return self._benchmark(self.model_path, **configuration)

@staticmethod
def _benchmark(
model_path: Path, repetitions: int, num_threads: int
model_path: Path | str,
repetitions: int,
num_messages: int,
num_threads: int,
) -> BenchmarkResult:
model = dai.OpenVINO.Blob(model_path)
input_name_shape: Dict[str, List[int]] = {}
input_name_type = {}
for i in list(model.networkInputs):
input_name_shape[i] = model.networkInputs[i].dims
input_name_type[i] = model.networkInputs[i].dataType.name

output_name_shape = {}
output_name_type = {}
for i in list(model.networkOutputs):
output_name_shape[i] = model.networkOutputs[i].dims
output_name_type[i] = model.networkOutputs[i].dataType.name

pipeline = dai.Pipeline()

detection_nn = pipeline.createNeuralNetwork()
detection_nn.setBlobPath(model_path)
detection_nn.setNumInferenceThreads(num_threads)
detection_nn.input.setBlocking(True)
detection_nn.input.setQueueSize(1)

nn_in = pipeline.createXLinkIn()
nn_in.setMaxDataSize(6291456)
nn_in.setStreamName("in_nn")
nn_in.out.link(detection_nn.input)

xout_nn = pipeline.createXLinkOut()
xout_nn.setStreamName("nn")
xout_nn.input.setQueueSize(1)
xout_nn.input.setBlocking(True)
detection_nn.out.link(xout_nn.input)

xlink_buffer_max_size = 5 * 1024 * 1024
product_sum = sum(
map(lambda x: np.product(np.array(x)), output_name_shape.values())
)

xlink_buffer_count = int(xlink_buffer_max_size / product_sum)

logger.info(f"XLink buffer count: {xlink_buffer_count}")
if xlink_buffer_count > 1000:
logger.warning(
"XLink buffer count is too high! "
"The benchmarking will take more time and "
"the results may be overestimated."
device = dai.Device()
if device.getPlatform() != dai.Platform.RVC2:
raise ValueError(
f"Found {device.getPlatformAsString()}, expected RVC2 platform."
)

with dai.Device(pipeline) as device, Progress() as progress:
device = cast(dai.Device, device)
detection_in_count = 100 + xlink_buffer_count
detection_in = device.getInputQueue(
"in_nn", maxSize=detection_in_count, blocking=True
if isinstance(model_path, str):
modelPath = dai.getModelFromZoo(
dai.NNModelDescription(
model_path,
platform=device.getPlatformAsString(),
)
)
q_nn = device.getOutputQueue(name="nn", maxSize=1, blocking=True)
elif str(model_path).endswith(".tar.xz"):
modelPath = str(model_path)
elif str(model_path).endswith(".blob"):
modelPath = model_path
else:
raise ValueError(
"Unsupported model format. Supported formats: .tar.xz, .blob, or HubAI model slug."
)

inputSizes = []
inputNames = []
if isinstance(model_path, str) or str(model_path).endswith(".tar.xz"):
modelArhive = dai.NNArchive(modelPath)
for input in modelArhive.getConfig().model.inputs:
inputSizes.append(input.shape[::-1])
inputNames.append(input.name)
elif str(model_path).endswith(".blob"):
blob_model = dai.OpenVINO.Blob(modelPath)
for input in blob_model.networkInputs:
inputSizes.append(blob_model.networkInputs[input].dims)
inputNames.append(input)

inputData = dai.NNData()
for name, inputSize in zip(inputNames, inputSizes):
img = np.random.randint(
0, 255, (inputSize[1], inputSize[0], 3), np.uint8
)
inputData.addTensor(name, img)

fps_storage = []
diffs = []
time.sleep(1)
with dai.Pipeline(device) as pipeline, Progress() as progress:
repet_task = progress.add_task(
"[magenta]Repetition", total=repetitions
)
infer_task = progress.add_task(
"[magenta]Inference", total=300 + 2 * xlink_buffer_count
)
for _ in range(repetitions):
progress.reset(infer_task, total=300 + 2 * xlink_buffer_count)
for _ in range(100 + xlink_buffer_count):
nn_data = dai.NNData()
for inp_name in input_name_shape:
if input_name_type[inp_name] in ["FLOAT16", "FLOAT32"]:
frame = cast(
np.ndarray,
np.random.rand(*input_name_shape[inp_name]),
)
frame = frame.astype(
"float16"
if input_name_type[inp_name] == "FLOAT16"
else "float32"
)
elif input_name_type[inp_name] in ["INT", "I8", "U8F"]:
frame = np.random.randint(
256,
size=input_name_shape[inp_name],
dtype=(
np.int32
if input_name_type[inp_name] == "INT"
else (
np.uint8
if input_name_type[inp_name] == "U8F"
else np.int8
)
),
)
else:
raise RuntimeError(
f"Unknown input type detected: {input_name_type[inp_name]}!"
)

nn_data.setLayer(inp_name, frame)

if len(input_name_shape) == 0:
raise RuntimeError(
"Failed to create input data: missing required information for one or more input layers."
)
detection_in.send(nn_data)
progress.update(infer_task, advance=1)

for _ in range(100):
progress.update(infer_task, advance=1)
time.sleep(3 / 100)

for _ in range(40 + xlink_buffer_count):
cast(NNData, q_nn.get()).getFirstLayerFp16()
progress.update(infer_task, advance=1)

start = time.time()
for _ in range(50):
cast(NNData, q_nn.get()).getFirstLayerFp16()
progress.update(infer_task, advance=1)
diff = time.time() - start
diffs.append(diff / 50)
fps_storage.append(50 / diff)

for _ in range(10):
cast(NNData, q_nn.get()).getFirstLayerFp16()
progress.update(infer_task, advance=1)

benchmarkOut = pipeline.create(dai.node.BenchmarkOut)
benchmarkOut.setRunOnHost(False)
benchmarkOut.setFps(-1)

neuralNetwork = pipeline.create(dai.node.NeuralNetwork)
if isinstance(model_path, str) or str(model_path).endswith(
".tar.xz"
):
neuralNetwork.setNNArchive(modelArhive)
elif str(model_path).endswith(".blob"):
neuralNetwork.setBlobPath(modelPath)
neuralNetwork.setNumInferenceThreads(num_threads)

benchmarkIn = pipeline.create(dai.node.BenchmarkIn)
benchmarkIn.setRunOnHost(False)
benchmarkIn.sendReportEveryNMessages(num_messages)
benchmarkIn.logReportsAsWarnings(False)

benchmarkOut.out.link(neuralNetwork.input)
neuralNetwork.out.link(benchmarkIn.input)

outputQueue = benchmarkIn.report.createOutputQueue()
inputQueue = benchmarkOut.input.createInputQueue()

pipeline.start()
inputQueue.send(inputData)

rep = 0
fps_list = []
avg_latency_list = []
while pipeline.isRunning() and rep < repetitions:
benchmarkReport = outputQueue.get()
if not isinstance(benchmarkReport, dai.BenchmarkReport):
raise ValueError(
f"Expected BenchmarkReport, got {type(benchmarkReport)}"
)
fps = benchmarkReport.fps
avg_latency = benchmarkReport.averageLatency * 1000

fps_list.append(fps)
avg_latency_list.append(avg_latency)
progress.update(repet_task, advance=1)
rep += 1

diffs = np.array(diffs) * 1000
return BenchmarkResult(np.mean(fps_storage), np.mean(diffs))
# Currently, the latency measurement is not supported on RVC2 by the depthai library.
return BenchmarkResult(np.mean(fps_list), "N/A")
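For completeness, a hedged sketch of driving the rewritten RVC2 benchmark end to end. The class name `RVC2Benchmark` and the model identifier are assumptions; the configuration keys and the `"N/A"` latency value come from the code above.

```python
# Hedged sketch, not part of this PR: class name and model identifier are
# assumed; the configuration matches all_configurations() above.
from modelconverter.packages.rvc2.benchmark import RVC2Benchmark  # hypothetical

bench = RVC2Benchmark("luxonis/yolov6-nano:r2-coco-512x288")  # slug, .tar.xz or .blob path
result = bench.benchmark({"repetitions": 10, "num_messages": 50, "num_threads": 2})
print(f"fps={result.fps:.2f}")
print(f"latency={result.latency}")  # "N/A" for now: latency is not reported on RVC2
```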