Feat/add instance segmentation #67
Merged

Commits (29):
3d3c88e  Add FastSAM (HonzaCuhel)
f2dbf33  Update (HonzaCuhel)
535d09a  Update Colab notebook (HonzaCuhel)
454d749  Add vizualization (HonzaCuhel)
7bb93e9  Update README.md and tests (HonzaCuhel)
3fcb736  Update COCO converter (HonzaCuhel)
5a0795d  Refactor YOLO converter (HonzaCuhel)
c0cf6ab  Refactor visualize function (HonzaCuhel)
a1c6b6a  [Automated] Updated coverage badge (actions-user)
7879220  fix: different color for different classes in the segmenetation visua… (sokovninn)
4fae718  Switch to SlimSAM (HonzaCuhel)
f40e5a0  Switch to SlimSAM (HonzaCuhel)
853d5ad  Update instance segmentation example (HonzaCuhel)
04e91fd  Update tests (HonzaCuhel)
ff771ad  Fix: annotator tests (HonzaCuhel)
335cc05  [Automated] Updated coverage badge (actions-user)
f887910  Update docs & luxonis dataset creation (HonzaCuhel)
b8151cb  fix: return SliamSAM processor (sokovninn)
af08e4b  fix: handle empty polygon list (sokovninn)
c566bea  Fix: remove long outputs from Jupyter Notebook (HonzaCuhel)
07a58f0  Fix: README.md (HonzaCuhel)
057a9b4  Add OWLv2 non-square pixel fix (HonzaCuhel)
437d067  Rename vars (HonzaCuhel)
cd819c4  Fix: correct all SlimSAM mentions (HonzaCuhel)
5e45347  fix: different image sizes for owlv2 postprocessing (sokovninn)
3b915ba  Update OWLv2 bbox correction (HonzaCuhel)
68487e4  fix: pass segmentation annotator size (sokovninn)
5401431  fix: shifted annotations when tta is used (sokovninn)
d47253a  Fix OWLv2 device (HonzaCuhel)
New file (+153 lines), the SlimSAMAnnotator module:
from __future__ import annotations

import logging
from typing import List

import numpy as np
import PIL
import torch
from transformers import SamModel, SamProcessor

from datadreamer.dataset_annotation.image_annotator import BaseAnnotator
from datadreamer.dataset_annotation.utils import mask_to_polygon

logger = logging.getLogger(__name__)

class SlimSAMAnnotator(BaseAnnotator):
    """A class for image annotation using the SlimSAM model, specializing in
    instance segmentation.

    Attributes:
        model (SamModel): The SAM model for instance segmentation.
        processor (SamProcessor): The processor for the SAM model.
        device (str): The device on which the model will run ('cuda' for GPU, 'cpu' for CPU).
        size (str): The size of the SAM model to use ('base' or 'large').

    Methods:
        _init_model(): Initializes the SAM model.
        _init_processor(): Initializes the processor for the SAM model.
        annotate_batch(images, boxes_batch, iou_threshold): Annotates the given images with segmentation polygons.
        release(empty_cuda_cache): Releases resources and optionally empties the CUDA cache.
    """

    def __init__(
        self,
        seed: float = 42,
        device: str = "cuda",
        size: str = "base",
    ) -> None:
        """Initializes the SlimSAMAnnotator with a specific seed, device and
        model size.

        Args:
            seed (float): Seed for reproducibility. Defaults to 42.
            device (str): The device to run the model on. Defaults to 'cuda'.
            size (str): The size of the SlimSAM model to use ('base' or 'large'). Defaults to 'base'.
        """
        super().__init__(seed)
        self.size = size
        self.model = self._init_model()
        self.processor = self._init_processor()
        self.device = device
        self.model.to(self.device)

    def _init_model(self) -> SamModel:
        """Initializes the SlimSAM model for instance segmentation.

        Returns:
            SamModel: The initialized SlimSAM model.
        """
        logger.info(f"Initializing SlimSAM {self.size} model...")
        if self.size == "large":
            return SamModel.from_pretrained("Zigeng/SlimSAM-uniform-50")
        return SamModel.from_pretrained("Zigeng/SlimSAM-uniform-77")

    def _init_processor(self) -> SamProcessor:
        """Initializes the processor for the SlimSAM model.

        Returns:
            SamProcessor: The initialized processor.
        """
        if self.size == "large":
            return SamProcessor.from_pretrained("Zigeng/SlimSAM-uniform-50")
        return SamProcessor.from_pretrained("Zigeng/SlimSAM-uniform-77")

    def annotate_batch(
        self,
        images: List[PIL.Image.Image],
        boxes_batch: List[np.ndarray],
        iou_threshold: float = 0.2,
    ) -> List[List[List[float]]]:
        """Annotates images for the task of instance segmentation using the
        SlimSAM model.

        Args:
            images: The images to be annotated.
            boxes_batch: The bounding boxes of found objects.
            iou_threshold (float, optional): Threshold on the predicted IoU scores used to filter mask proposals. Defaults to 0.2.

        Returns:
            List: A list containing the final segmentation masks represented as polygons.
        """
        final_segments = []

        n = len(images)

        for i in range(n):
            boxes = boxes_batch[i].tolist()
            if len(boxes) == 0:
                final_segments.append([])
                continue

            inputs = self.processor(
                images[i], input_boxes=[boxes], return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs, return_dict=True)

            # Resize the predicted masks back to the original image size
            masks = self.processor.image_processor.post_process_masks(
                outputs.pred_masks.cpu(),
                inputs["original_sizes"].cpu(),
                inputs["reshaped_input_sizes"].cpu(),
            )[0]

            iou_scores = outputs.iou_scores.cpu()

            image_masks = []
            for j in range(len(boxes)):
                # Keep only mask proposals above the IoU threshold,
                # then merge them by taking their union
                keep_idx = iou_scores[0, j] >= iou_threshold
                filtered_masks = masks[j, keep_idx].cpu().float()
                final_masks = filtered_masks.permute(1, 2, 0)
                final_masks = final_masks.mean(dim=-1)
                final_masks = (final_masks > 0).int()
                final_masks = final_masks.numpy().astype(np.uint8)
                polygon = mask_to_polygon(final_masks)
                if len(polygon) != 0:
                    image_masks.append(polygon)

            final_segments.append(image_masks)

        return final_segments

    def release(self, empty_cuda_cache: bool = False) -> None:
        """Releases the model and optionally empties the CUDA cache.

        Args:
            empty_cuda_cache (bool, optional): Whether to empty the CUDA cache. Defaults to False.
        """
        self.model = self.model.to("cpu")
        if empty_cuda_cache:
            with torch.no_grad():
                torch.cuda.empty_cache()

if __name__ == "__main__": | ||
import requests | ||
from PIL import Image | ||
|
||
url = "https://ultralytics.com/images/bus.jpg" | ||
im = Image.open(requests.get(url, stream=True).raw) | ||
annotator = SlimSAMAnnotator(device="cpu", size="large") | ||
final_segments = annotator.annotate_batch([im], [np.array([[3, 229, 559, 650]])]) | ||
print(len(final_segments), len(final_segments[0])) | ||
print(final_segments[0][0][:5]) |
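
The mask_to_polygon helper is imported from datadreamer.dataset_annotation.utils and its implementation is not part of this diff. As a rough illustration only, such a helper could be written with OpenCV along the following lines (a hypothetical sketch, not the repo's actual implementation):

import cv2
import numpy as np


def mask_to_polygon_sketch(mask: np.ndarray) -> list:
    """Hypothetical sketch: convert a binary uint8 mask of shape (H, W)
    into a polygon, returned as a list of [x, y] points for the largest
    contour, or an empty list if the mask contains no foreground."""
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return []
    # Pick the largest contour so that one polygon is returned per instance
    largest_contour = max(contours, key=cv2.contourArea)
    return largest_contour.reshape(-1, 2).tolist()

Returning an empty list for empty masks matches the `if len(polygon) != 0` guard in annotate_batch above (and the "fix: handle empty polygon list" commit).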
Review discussion:

SlimSAM doesn't support batched inference?

The thing is that each image can have a different number of detected objects, so batched inference isn't possible straight away; that's why I implemented it per image. But now that you've mentioned it, I thought about it again and realized that we could "pad" the bboxes with dummy bboxes so that we can have batch inference. I'm currently testing it. Let me know @sokovninn if you'd find this small hack better.

Oh, I see. Dummy bboxes are a good solution. However, I am not sure if it will bring any boost in inference speed, but it is worth a try, I think.

Exactly, I'll test it and let you know.

It turned out not to be faster, so I'm not going to use it.
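
For reference, the dummy-bbox padding idea discussed above could look roughly like the sketch below. This is hypothetical code, not part of the PR; as noted, the approach was benchmarked, found no faster, and not merged.

from typing import List, Tuple

import numpy as np


def pad_boxes_batch(
    boxes_batch: List[np.ndarray], dummy_box=(0.0, 0.0, 1.0, 1.0)
) -> Tuple[List[List[List[float]]], List[int]]:
    """Hypothetical sketch: pad every image's box list to the same length
    with dummy boxes so a single batched SAM forward pass is possible.
    Returns the padded boxes (nested lists suitable for the processor's
    input_boxes argument) and the original per-image counts."""
    counts = [len(b) for b in boxes_batch]
    max_n = max(counts) if counts else 0
    padded = []
    for boxes in boxes_batch:
        boxes = boxes.tolist()
        # Append dummy boxes until every image has max_n boxes
        boxes += [list(dummy_box)] * (max_n - len(boxes))
        padded.append(boxes)
    return padded, counts

After one batched forward pass, only the first counts[i] masks of image i would be kept and the dummy-box predictions discarded.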