
DataDreamer - v0.2.0 #68

Merged: 60 commits, Nov 12, 2024

Changes from 1 commit

Commits (60)
0137b8f
chore: update requirements.txt
sokovninn Apr 29, 2024
ce1d8d1
Merge branch 'main' into dev
sokovninn Apr 29, 2024
15fe38a
feature: add dataset_plugin argument
sokovninn May 8, 2024
a8f12dd
docs: improve dataset_plugin description
sokovninn May 8, 2024
a5975cf
Merge pull request #55 from luxonis/feature/luxonis-online-dataset
sokovninn May 9, 2024
0ada3cc
fix: dataset plugin default value
sokovninn Jun 26, 2024
872d001
Feature/luxonis loader dataset (#57)
sokovninn Aug 5, 2024
931390d
Merge main to dev (#58)
sokovninn Aug 8, 2024
a1ea12b
chore: fix gar publish dev action
sokovninn Aug 9, 2024
6b580ba
feature: add branch arg to Dockerfile
sokovninn Aug 9, 2024
307102c
fix: build dev docker image from dev branch
sokovninn Aug 9, 2024
b3a23ea
Fix/bbox labels in LuxonisDatasetConverter (#59)
sokovninn Aug 20, 2024
9653ec3
fix: save images as PNG with full quality (#60)
conorsim Sep 18, 2024
e563c97
Fix: loader plugin arg to load image paths (#61)
conorsim Sep 20, 2024
f800b49
Add logger, tests and refactor (#62)
sokovninn Sep 27, 2024
196e4ca
feat: add prompt weighting for sdxl-turbo (#65)
sokovninn Oct 2, 2024
a5afb36
Rework GHCR publish actions (#64)
sokovninn Oct 2, 2024
154c50e
Add profanity filter for input class names (#63)
HonzaCuhel Oct 7, 2024
725d353
Add Qwen2.5 LM as prompt generator (#66)
HonzaCuhel Oct 8, 2024
3d3c88e
Add FastSAM
HonzaCuhel Oct 17, 2024
f2dbf33
Update
HonzaCuhel Oct 19, 2024
535d09a
Update Colab notebook
HonzaCuhel Oct 19, 2024
454d749
Add vizualization
HonzaCuhel Oct 20, 2024
7bb93e9
Update README.md and tests
HonzaCuhel Oct 21, 2024
3fcb736
Update COCO converter
HonzaCuhel Oct 21, 2024
5a0795d
Refactor YOLO converter
HonzaCuhel Oct 21, 2024
c0cf6ab
Refactor visualize function
HonzaCuhel Oct 21, 2024
a1c6b6a
[Automated] Updated coverage badge
actions-user Oct 21, 2024
7879220
fix: different color for different classes in the segmenetation visua…
sokovninn Oct 21, 2024
4fae718
Switch to SlimSAM
HonzaCuhel Oct 24, 2024
f40e5a0
Switch to SlimSAM
HonzaCuhel Oct 24, 2024
853d5ad
Update instance segmentation example
HonzaCuhel Oct 24, 2024
04e91fd
Update tests
HonzaCuhel Oct 24, 2024
ff771ad
Fix: annotator tests
HonzaCuhel Oct 24, 2024
335cc05
[Automated] Updated coverage badge
actions-user Oct 24, 2024
f887910
Update docs & luxonis dataset creation
HonzaCuhel Oct 25, 2024
b8151cb
fix: return SliamSAM processor
sokovninn Oct 25, 2024
af08e4b
fix: handle empty polygon list
sokovninn Oct 25, 2024
c566bea
Fix: remove long outputs from Jupyter Notebook
HonzaCuhel Oct 25, 2024
07a58f0
Fix: README.md
HonzaCuhel Oct 25, 2024
057a9b4
Add OWLv2 non-square pixel fix
HonzaCuhel Oct 25, 2024
437d067
Rename vars
HonzaCuhel Oct 25, 2024
cd819c4
Fix: correct all SlimSAM mentions
HonzaCuhel Oct 25, 2024
5e45347
fix: different image sizes for owlv2 postprocessing
sokovninn Oct 25, 2024
3b915ba
Update OWLv2 bbox correction
HonzaCuhel Oct 25, 2024
68487e4
fix: pass segmentation annotator size
sokovninn Oct 25, 2024
5401431
fix: shifted annotations when tta is used
sokovninn Oct 25, 2024
d47253a
Fix OWLv2 device
HonzaCuhel Oct 28, 2024
cba6516
Merge pull request #67 from luxonis/feat/add-instance-segmentation
HonzaCuhel Oct 28, 2024
72608b8
Merge branch 'main' into dev
sokovninn Oct 29, 2024
3aeab4d
[Automated] Updated coverage badge
actions-user Oct 29, 2024
1914f7d
chore: update version to 0.2.0
sokovninn Oct 29, 2024
72f2aa4
Fix: convert images to RGB (#69)
HonzaCuhel Oct 30, 2024
749696d
[Automated] Updated coverage badge
actions-user Oct 30, 2024
77be96e
Fix: LuxonisDataset Converter - bbox computation (#70)
HonzaCuhel Nov 3, 2024
ec4df44
[Automated] Updated coverage badge
actions-user Nov 3, 2024
6a62e94
Add images without annotations to LuxonisDataset (#71)
sokovninn Nov 8, 2024
e9bde26
[Automated] Updated coverage badge
actions-user Nov 8, 2024
c89ca34
Add an option to keep images with no annotation (#72)
HonzaCuhel Nov 12, 2024
0ebcd68
[Automated] Updated coverage badge
actions-user Nov 12, 2024
Add FastSAM
HonzaCuhel committed Oct 17, 2024
commit 3d3c88ea7a52740fc9f072ab762e6f4c1d5d1e4e
9 changes: 8 additions & 1 deletion datadreamer/dataset_annotation/__init__.py
@@ -1,7 +1,14 @@
from __future__ import annotations

from .clip_annotator import CLIPAnnotator
from .fastsam_annotator import FastSAMAnnotator
from .image_annotator import BaseAnnotator, TaskList
from .owlv2_annotator import OWLv2Annotator

__all__ = ["BaseAnnotator", "TaskList", "OWLv2Annotator", "CLIPAnnotator"]
__all__ = [
"BaseAnnotator",
"TaskList",
"OWLv2Annotator",
"CLIPAnnotator",
"FastSAMAnnotator",
]
103 changes: 103 additions & 0 deletions datadreamer/dataset_annotation/fastsam_annotator.py
@@ -0,0 +1,103 @@
from __future__ import annotations

import logging
from typing import List, Literal, Tuple

import numpy as np
import PIL
from ultralytics import FastSAM

logger = logging.getLogger(__name__)


class FastSAMAnnotator:
"""A class for image annotation using the FastSAM model, specializing in instance
segmentation.

Attributes:
device (str): The device to run the model on.
size (str): The size variant of the FastSAM model ('base' or 'large').
model (FastSAM): The FastSAM model.

Methods:
annotate_batch(images, prompts, boxes_batch, scores_batch, labels_batch, conf_threshold, iou_threshold): Annotates the given images with instance segmentation masks for the provided boxes.
"""

def __init__(
self,
device: str = "cuda",
size: Literal["base", "large"] = "large",
) -> None:
"""Initializes the FastSAMAnnotator object.

Args:
device (str): The device to run the model on ('cuda' or 'cpu'). Defaults to 'cuda'.
size (str): The size of the FastSAM model to use: 'base' (FastSAM-s) or 'large' (FastSAM-x).
"""
self.size = size
self.device = device
self.model = self._init_model()

def _init_model(self) -> FastSAM:
"""Initializes the FastSAM model for instance segmentation.

Returns:
FastSAM: The initialized FastSAM model.
"""
model_size = "s" if self.size == "base" else "x"
logger.info(f"Initializing FastSAM {model_size} model...")
return FastSAM(f"FastSAM-{model_size}.pt")

def annotate_batch(
self,
images: List[PIL.Image.Image],
prompts: List[str],
boxes_batch: List[np.ndarray],
scores_batch: List[np.ndarray],
labels_batch: List[np.ndarray],
conf_threshold: float = 0.5,
iou_threshold: float = 0.2,
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
"""Annotates images using the OWLv2 model.

Args:
images: The images to be annotated.
prompts: Prompts to guide the annotation.
conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1.
iou_threshold (float, optional): Intersection over union threshold for non-maximum suppression. Defaults to 0.2.
use_tta (bool, optional): Flag to apply test-time augmentation. Defaults to False.
synonym_dict (dict, optional): Dictionary for handling synonyms in labels. Defaults to None.

Returns:
tuple: A tuple containing the final bounding boxes, scores, and labels for the annotations.
"""
final_segments = []

n = len(images)

for i in range(n):
batch_segments = []
for box, label in zip(boxes_batch[i], labels_batch[i]):
result = self.model(
images[i],
device=self.device,
bboxes=box,
texts=prompts[label],
labels=[1],
conf=conf_threshold,
iou=iou_threshold,
verbose=False,
)
mask_segment = result[0].masks.xy[0]
print("mask", mask_segment.shape)
batch_segments.append(mask_segment)
final_segments.append(batch_segments)

return boxes_batch, scores_batch, labels_batch, final_segments


if __name__ == "__main__":
import requests
from PIL import Image

url = "https://ultralytics.com/images/bus.jpg"
im = Image.open(requests.get(url, stream=True).raw)
annotator = FastSAMAnnotator(device="cpu", size="base")
68 changes: 61 additions & 7 deletions datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -16,7 +16,11 @@
from PIL import Image
from tqdm import tqdm

from datadreamer.dataset_annotation import CLIPAnnotator, OWLv2Annotator
from datadreamer.dataset_annotation import (
CLIPAnnotator,
FastSAMAnnotator,
OWLv2Annotator,
)
from datadreamer.image_generation import (
StableDiffusionImageGenerator,
StableDiffusionLightningImageGenerator,
@@ -54,6 +58,8 @@

det_annotators = {"owlv2": OWLv2Annotator}
clf_annotators = {"clip": CLIPAnnotator}
inst_seg_annotators = {"owlv2_fastsam": FastSAMAnnotator}
inst_seg_to_det = {"owlv2_fastsam": OWLv2Annotator}

setup_logging(use_rich=True)

@@ -70,7 +76,7 @@ def parse_args():
parser.add_argument(
"--task",
type=str,
choices=["detection", "classification"],
choices=["detection", "classification", "instance-segmentation"],
help="Task to generate data for",
)

@@ -116,7 +122,7 @@ def parse_args():
parser.add_argument(
"--image_annotator",
type=str,
choices=["owlv2", "clip"],
choices=["owlv2", "clip", "owlv2_fastsam"],
help="Image annotator to use",
)

@@ -357,6 +363,14 @@ def check_args(args):
"--image_annotator must be one of the available annotators for classification task"
)

if (
args.task == "instance-segmentation"
and args.image_annotator not in inst_seg_annotators
):
raise ValueError(
"--image_annotator must be one of the available annotators for instance segmentation task"
)

# Check correct task and dataset_format
if args.task == "classification" and args.dataset_format in ["coco", "yolo"]:
raise ValueError(
@@ -368,6 +382,11 @@ def check_args(args):
"--dataset_format must be one of the available dataset formats for detection task: raw, coco, yolo, luxonis-dataset"
)

if args.task == "instance-segmentation" and args.dataset_format in ["cls-single"]:
raise ValueError(
"--dataset_format must be one of the available dataset formats for instance segmentation task: raw, coco, yolo, luxonis-dataset"
)

# Check split_ratios
if (
len(args.split_ratios) != 3
@@ -540,6 +559,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
boxes_list = []
scores_list = []
labels_list = []
segment_list = []
image_paths = []

if args.task == "classification":
@@ -583,7 +603,12 @@ def read_image_batch(image_batch, batch_num, batch_size):
)
else:
# Detection / instance-segmentation annotation
annotator_class = det_annotators[args.image_annotator]
if args.task == "detection":
annotator_class = det_annotators[args.image_annotator]
else:
annotator_class = inst_seg_to_det[args.image_annotator]
inst_seg_annotator_class = inst_seg_annotators[args.image_annotator]
inst_seg_annotator = inst_seg_annotator_class(device=args.device)
annotator = annotator_class(device=args.device, size=args.annotator_size)

for i, image_batch in tqdm(
@@ -608,14 +633,42 @@ def read_image_batch(image_batch, batch_num, batch_size):
boxes_list.extend(boxes_batch)
scores_list.extend(scores_batch)

if args.task == "instance-segmentation":
(
boxes_batch,
scores_batch,
local_labels_batch,
masks_batch,
) = inst_seg_annotator.annotate_batch(
images=images,
prompts=args.class_names,
boxes_batch=boxes_batch,
scores_batch=scores_batch,
labels_batch=local_labels_batch,
conf_threshold=args.conf_threshold,
iou_threshold=args.annotation_iou_threshold,
)
segment_list.extend(masks_batch)

for j, image in enumerate(images):
labels = []
# Save bbox visualizations
fig, ax = plt.subplots(1)
ax.imshow(image)
for box, score, label in zip(
boxes_batch[j], scores_batch[j], local_labels_batch[j]
):
for k in range(len(boxes_batch[j])):
box = boxes_batch[j][k]
score = scores_batch[j][k]
label = local_labels_batch[j][k]
if args.task == "instance-segmentation":
mask = masks_batch[j][k]
print("mask", type(mask))
labels.append(label)
x1, y1, x2, y2 = box
rect = patches.Rectangle(
@@ -658,6 +711,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
image_paths=image_paths,
labels_list=labels_list,
boxes_list=boxes_list,
masks_list=segment_list if len(segment_list) > 0 else None,
class_names=args.class_names,
save_dir=save_dir,
)
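Taken together, the instance-segmentation path runs two models per image batch: the detector proposes boxes, then FastSAM converts each box into a polygon mask. A condensed sketch of that flow, with the batching, visualization, and argument-parsing plumbing omitted (image_batches is a hypothetical stand-in for the batched loop above):

# Condensed flow for --task instance-segmentation --image_annotator owlv2_fastsam
detector = inst_seg_to_det[args.image_annotator](device=args.device, size=args.annotator_size)
segmenter = inst_seg_annotators[args.image_annotator](device=args.device)

for images in image_batches:
    # OWLv2 stage: box proposals (detector call arguments elided here)
    boxes_batch, scores_batch, labels_batch = detector.annotate_batch(...)
    # FastSAM stage: each box is refined into a polygon mask
    boxes_batch, scores_batch, labels_batch, masks_batch = segmenter.annotate_batch(
        images=images,
        prompts=args.class_names,
        boxes_batch=boxes_batch,
        scores_batch=scores_batch,
        labels_batch=labels_batch,
        conf_threshold=args.conf_threshold,
        iou_threshold=args.annotation_iou_threshold,
    )
    segment_list.extend(masks_batch)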
18 changes: 14 additions & 4 deletions datadreamer/utils/coco_converter.py
@@ -4,6 +4,7 @@
import os
import shutil

import numpy as np
from PIL import Image

from datadreamer.utils.base_converter import BaseConverter
@@ -28,8 +29,9 @@ class COCOConverter(BaseConverter):
│ ├── labels.json
"""

def __init__(self, seed=42):
def __init__(self, seed=42, is_instance_segmentation: bool = False):
super().__init__(seed)
self.is_instance_segmentation = is_instance_segmentation

def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> None:
"""Converts a dataset into a COCO format.
@@ -99,15 +101,23 @@ def process_data(
"height": image_height,
}
)

for box, label in zip(annotation["boxes"], annotation["labels"]):
masks = (
annotation["masks"]
if "masks" in annotation and self.is_instance_segmentation
else [None for i in range(len(annotation["boxes"]))]
)
for box, label, mask in zip(
annotation["boxes"], annotation["labels"], masks
):
annotations.append(
{
"id": annotation_id,
"image_id": len(images_info),
"category_id": label,
"bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]],
"segmentation": None, # [[box[0], box[1], box[2], box[1], box[2], box[3], box[0], box[3]]], # bbox mask
"segmentation": np.array(mask).reshape(-1)
if mask is not None
else None, # [[box[0], box[1], box[2], box[1], box[2], box[3], box[0], box[3]]], # bbox mask
"area": (box[2] - box[0]) * (box[3] - box[1]),
"iscrowd": 0,
}
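For reference, a single resulting COCO annotation entry for a masked detection would then look roughly like this (values hypothetical):

# Hypothetical annotation entry: bbox is [x, y, width, height]; segmentation
# holds the flattened polygon [x1, y1, x2, y2, ...] built from the mask points.
{
    "id": 1,
    "image_id": 1,
    "category_id": 0,
    "bbox": [3.0, 229.0, 794.0, 499.0],
    "segmentation": [[3.0, 229.0, 797.0, 229.0, 797.0, 728.0, 3.0, 728.0]],
    "area": 396206.0,
    "iscrowd": 0,
}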
4 changes: 2 additions & 2 deletions datadreamer/utils/config.py
@@ -10,7 +10,7 @@ class Config(LuxonisConfig):
save_dir: str = "generated_dataset"
class_names: List[str] = ["bear", "bicycle", "bird", "person"]
prompts_number: int = 10
task: Literal["detection", "classification"] = "detection"
task: Literal["detection", "classification", "instance-segmentation"] = "detection"
seed: int = 42
device: Literal["cuda", "cpu"] = "cuda"
annotate_only: bool = False
@@ -39,7 +39,7 @@ class Config(LuxonisConfig):
# Profanity filter arguments
disable_lm_filter: bool = False
# Annotation arguments
image_annotator: Literal["owlv2", "clip"] = "owlv2"
image_annotator: Literal["owlv2", "clip", "owlv2_fastsam"] = "owlv2"
conf_threshold: float = 0.15
annotation_iou_threshold: float = 0.2
use_tta: bool = False
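With the widened Literal types, an instance-segmentation run can be configured like any other task. A minimal sketch, assuming Config can be constructed directly (in practice it is usually populated from a YAML file or CLI overrides via LuxonisConfig):

from datadreamer.utils.config import Config

# Hypothetical direct construction; the field names and values are those
# permitted by the Literal types in the diff above.
cfg = Config(
    class_names=["bear", "bicycle", "bird", "person"],
    task="instance-segmentation",
    image_annotator="owlv2_fastsam",
    conf_threshold=0.15,
    annotation_iou_threshold=0.2,
)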
10 changes: 8 additions & 2 deletions datadreamer/utils/convert_dataset.py
@@ -17,6 +17,7 @@ def convert_dataset(
split_ratios,
dataset_plugin=None,
dataset_name=None,
is_instance_segmentation=False,
copy_files=True,
seed=42,
) -> None:
@@ -36,14 +37,19 @@ def convert_dataset(
"""

if dataset_format == "yolo":
converter = YOLOConverter(seed=seed)
converter = YOLOConverter(
seed=seed, is_instance_segmentation=is_instance_segmentation
)
elif dataset_format == "coco":
converter = COCOConverter(seed=seed)
converter = COCOConverter(
seed=seed, is_instance_segmentation=is_instance_segmentation
)
elif dataset_format == "luxonis-dataset":
converter = LuxonisDatasetConverter(
dataset_plugin=dataset_plugin,
dataset_name=dataset_name,
seed=seed,
is_instance_segmentation=is_instance_segmentation,
)
elif dataset_format == "cls-single":
converter = SingleLabelClsConverter(seed=seed)
6 changes: 6 additions & 0 deletions datadreamer/utils/dataset_utils.py
@@ -6,6 +6,7 @@ def save_annotations_to_json(
image_paths,
labels_list,
boxes_list=None,
masks_list=None,
class_names=None,
save_dir=None,
file_name="annotations.json",
@@ -16,6 +17,7 @@ def save_annotations_to_json(
image_paths (list): List of image paths.
labels_list (list): List of labels.
boxes_list (list, optional): List of bounding boxes. Defaults to None.
masks_list (list, optional): List of instance segmentation masks. Defaults to None.
class_names (list, optional): List of class names. Defaults to None.
save_dir (str, optional): Directory to save the JSON file. Defaults to None.
file_name (str, optional): Name of the JSON file. Defaults to 'annotations.json'.
@@ -38,6 +40,10 @@ def save_annotations_to_json(
bboxes = boxes_list[i]
annotations[image_name]["boxes"] = bboxes.tolist()

if masks_list is not None:
# Cast polygon points to plain floats so the masks are JSON-serializable
annotations[image_name]["masks"] = [
[[float(x), float(y)] for x, y in mask] for mask in masks_list[i]
]

annotations["class_names"] = class_names

# Save to JSON file
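The resulting annotations.json then gains a "masks" entry per image alongside "boxes". A hypothetical snippet of the structure, where each mask is a polygon stored as a list of [x, y] points:

# Hypothetical annotations.json content for one image with a single detection
{
    "image_1.jpg": {
        "labels": [0],
        "boxes": [[3.0, 229.0, 797.0, 728.0]],
        "masks": [[[3.0, 229.0], [797.0, 229.0], [797.0, 728.0], [3.0, 728.0]]],
    },
    "class_names": ["bus"],
}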
7 changes: 6 additions & 1 deletion datadreamer/utils/luxonis_dataset_converter.py
@@ -17,9 +17,14 @@ class LuxonisDatasetConverter(BaseConverter):
"""Class for converting a dataset to LuxonisDataset format."""

def __init__(
self, dataset_plugin: str = None, dataset_name: str = None, seed: int = 42
self,
dataset_plugin: str = None,
dataset_name: str = None,
seed: int = 42,
is_instance_segmentation: bool = False,
):
super().__init__(seed)
self.is_instance_segmentation = is_instance_segmentation
self.dataset_plugin = dataset_plugin
self.dataset_name = dataset_name

44 changes: 38 additions & 6 deletions datadreamer/utils/yolo_converter.py
@@ -30,8 +30,9 @@ class YOLOConverter(BaseConverter):
│ ├── labels
"""

def __init__(self, seed=42):
def __init__(self, seed=42, is_instance_segmentation: bool = False):
super().__init__(seed)
self.is_instance_segmentation = is_instance_segmentation

def convert(
self,
@@ -74,6 +75,26 @@ def convert_to_yolo_format(
height = (box[3] - box[1]) / image_height
return [x_center, y_center, width, height]

def convert_masks_to_yolo_format(
self, masks: List[List[float]], image_width: int, image_height: int
) -> List[float]:
"""Converts masks to YOLO format.
Args:
masks (list of list of float): A list containing the masks.
image_width (int): The width of the image.
image_height (int): The height of the image.
Returns:
list of float: A list containing the masks in YOLO format.
"""
yolo_masks = []
for mask in masks:
x, y = mask[0], mask[1]
yolo_masks.append(x / image_width)
yolo_masks.append(y / image_height)
return yolo_masks

def process_data(
self,
data: Dict,
@@ -130,11 +151,22 @@ def process_data(
label_output_dir, os.path.splitext(image_name)[0] + ".txt"
)
with open(label_file, "w") as f:
for box, label in zip(annotation["boxes"], annotation["labels"]):
yolo_box = self.convert_to_yolo_format(
box, image_width, image_height
)
f.write(f"{label} {' '.join(map(str, yolo_box))}\n")
if self.is_instance_segmentation:
for masks, label in zip(
annotation["masks"], annotation["labels"]
):
yolo_mask = self.convert_masks_to_yolo_format(
masks, image_width, image_height
)
f.write(f"{label} {' '.join(map(str, yolo_mask))}\n")
else:
for box, label in zip(
annotation["boxes"], annotation["labels"]
):
yolo_box = self.convert_to_yolo_format(
box, image_width, image_height
)
f.write(f"{label} {' '.join(map(str, yolo_box))}\n")

if copy_files:
shutil.copy(
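For reference, a YOLO segmentation label line is the class id followed by the polygon normalized to the image size. A small worked example using the converter above (values hypothetical):

# Each label line is "<class-id> x1 y1 x2 y2 ..." with coordinates in [0, 1].
converter = YOLOConverter(is_instance_segmentation=True)
mask = [[32.0, 64.0], [96.0, 64.0], [96.0, 128.0]]  # hypothetical polygon points
yolo_mask = converter.convert_masks_to_yolo_format(mask, image_width=128, image_height=256)
print(f"0 {' '.join(map(str, yolo_mask))}")
# -> 0 0.25 0.25 0.75 0.25 0.75 0.5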
1 change: 1 addition & 0 deletions requirements.txt
@@ -15,3 +15,4 @@ nltk>=3.8.1
luxonis-ml[all]>=0.3.0
python-box>=7.1.1
gcsfs>=2023.1.0
ultralytics>=8.3.13