DataDreamer - v0.2.0 #68

Merged
merged 60 commits into from
Nov 12, 2024
Changes from 1 commit
Commits
60 commits
0137b8f
chore: update requirements.txt
sokovninn Apr 29, 2024
ce1d8d1
Merge branch 'main' into dev
sokovninn Apr 29, 2024
15fe38a
feature: add dataset_plugin argument
sokovninn May 8, 2024
a8f12dd
docs: improve dataset_plugin description
sokovninn May 8, 2024
a5975cf
Merge pull request #55 from luxonis/feature/luxonis-online-dataset
sokovninn May 9, 2024
0ada3cc
fix: dataset plugin default value
sokovninn Jun 26, 2024
872d001
Feature/luxonis loader dataset (#57)
sokovninn Aug 5, 2024
931390d
Merge main to dev (#58)
sokovninn Aug 8, 2024
a1ea12b
chore: fix gar publish dev action
sokovninn Aug 9, 2024
6b580ba
feature: add branch arg to Dockerfile
sokovninn Aug 9, 2024
307102c
fix: build dev docker image from dev branch
sokovninn Aug 9, 2024
b3a23ea
Fix/bbox labels in LuxonisDatasetConverter (#59)
sokovninn Aug 20, 2024
9653ec3
fix: save images as PNG with full quality (#60)
conorsim Sep 18, 2024
e563c97
Fix: loader plugin arg to load image paths (#61)
conorsim Sep 20, 2024
f800b49
Add logger, tests and refactor (#62)
sokovninn Sep 27, 2024
196e4ca
feat: add prompt weighting for sdxl-turbo (#65)
sokovninn Oct 2, 2024
a5afb36
Rework GHCR publish actions (#64)
sokovninn Oct 2, 2024
154c50e
Add profanity filter for input class names (#63)
HonzaCuhel Oct 7, 2024
725d353
Add Qwen2.5 LM as prompt generator (#66)
HonzaCuhel Oct 8, 2024
3d3c88e
Add FastSAM
HonzaCuhel Oct 17, 2024
f2dbf33
Update
HonzaCuhel Oct 19, 2024
535d09a
Update Colab notebook
HonzaCuhel Oct 19, 2024
454d749
Add vizualization
HonzaCuhel Oct 20, 2024
7bb93e9
Update README.md and tests
HonzaCuhel Oct 21, 2024
3fcb736
Update COCO converter
HonzaCuhel Oct 21, 2024
5a0795d
Refactor YOLO converter
HonzaCuhel Oct 21, 2024
c0cf6ab
Refactor visualize function
HonzaCuhel Oct 21, 2024
a1c6b6a
[Automated] Updated coverage badge
actions-user Oct 21, 2024
7879220
fix: different color for different classes in the segmenetation visua…
sokovninn Oct 21, 2024
4fae718
Switch to SlimSAM
HonzaCuhel Oct 24, 2024
f40e5a0
Switch to SlimSAM
HonzaCuhel Oct 24, 2024
853d5ad
Update instance segmentation example
HonzaCuhel Oct 24, 2024
04e91fd
Update tests
HonzaCuhel Oct 24, 2024
ff771ad
Fix: annotator tests
HonzaCuhel Oct 24, 2024
335cc05
[Automated] Updated coverage badge
actions-user Oct 24, 2024
f887910
Update docs & luxonis dataset creation
HonzaCuhel Oct 25, 2024
b8151cb
fix: return SliamSAM processor
sokovninn Oct 25, 2024
af08e4b
fix: handle empty polygon list
sokovninn Oct 25, 2024
c566bea
Fix: remove long outputs from Jupyter Notebook
HonzaCuhel Oct 25, 2024
07a58f0
Fix: README.md
HonzaCuhel Oct 25, 2024
057a9b4
Add OWLv2 non-square pixel fix
HonzaCuhel Oct 25, 2024
437d067
Rename vars
HonzaCuhel Oct 25, 2024
cd819c4
Fix: correct all SlimSAM mentions
HonzaCuhel Oct 25, 2024
5e45347
fix: different image sizes for owlv2 postprocessing
sokovninn Oct 25, 2024
3b915ba
Update OWLv2 bbox correction
HonzaCuhel Oct 25, 2024
68487e4
fix: pass segmentation annotator size
sokovninn Oct 25, 2024
5401431
fix: shifted annotations when tta is used
sokovninn Oct 25, 2024
d47253a
Fix OWLv2 device
HonzaCuhel Oct 28, 2024
cba6516
Merge pull request #67 from luxonis/feat/add-instance-segmentation
HonzaCuhel Oct 28, 2024
72608b8
Merge branch 'main' into dev
sokovninn Oct 29, 2024
3aeab4d
[Automated] Updated coverage badge
actions-user Oct 29, 2024
1914f7d
chore: update version to 0.2.0
sokovninn Oct 29, 2024
72f2aa4
Fix: convert images to RGB (#69)
HonzaCuhel Oct 30, 2024
749696d
[Automated] Updated coverage badge
actions-user Oct 30, 2024
77be96e
Fix: LuxonisDataset Converter - bbox computation (#70)
HonzaCuhel Nov 3, 2024
ec4df44
[Automated] Updated coverage badge
actions-user Nov 3, 2024
6a62e94
Add images without annotations to LuxonisDataset (#71)
sokovninn Nov 8, 2024
e9bde26
[Automated] Updated coverage badge
actions-user Nov 8, 2024
c89ca34
Add an option to keep images with no annotation (#72)
HonzaCuhel Nov 12, 2024
0ebcd68
[Automated] Updated coverage badge
actions-user Nov 12, 2024
Update
HonzaCuhel committed Oct 19, 2024
commit f2dbf3378cbafbd9b35428846617b11d37af208d
5 changes: 3 additions & 2 deletions README.md
@@ -157,13 +157,13 @@ datadreamer --config <path-to-config>

### 🔧 Additional Parameters

- `--task`: Choose between detection and classification. Default is `detection`.
- `--task`: Choose between detection, classification and instance segmentation. Default is `detection`.
- `--dataset_format`: Format of the dataset. Defaults to `raw`. Supported values: `raw`, `yolo`, `coco`, `luxonis-dataset`, `cls-single`.
- `--split_ratios`: Split ratios for train, validation, and test sets. Defaults to `[0.8, 0.1, 0.1]`.
- `--num_objects_range`: Range of objects in a prompt. Default is 1 to 3.
- `--prompt_generator`: Choose between `simple`, `lm` (Mistral-7B), `tiny` (tiny LM), and `qwen2` (Qwen2.5 LM). Default is `qwen2`.
- `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. Default is `sdxl-turbo`.
- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification. Default is `owlv2`.
- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification or `owlv2-fastsam` for instance segmentation. Default is `owlv2`.
- `--conf_threshold`: Confidence threshold for annotation. Default is `0.15`.
- `--annotation_iou_threshold`: Intersection over Union (IoU) threshold for annotation. Default is `0.2`.
- `--prompt_prefix`: Prefix to add to every image generation prompt. Default is `""`.
@@ -199,6 +199,7 @@ datadreamer --config <path-to-config>
| | [SDXL-Lightning](https://huggingface.co/ByteDance/SDXL-Lightning) | Fast and accurate (1024x1024 images) |
| Image Annotation | [OWLv2](https://huggingface.co/google/owlv2-base-patch16-ensemble) | Open-Vocabulary object detector |
| | [CLIP](https://huggingface.co/openai/clip-vit-base-patch32) | Zero-shot-image-classification |
| | [FastSAM](https://docs.ultralytics.com/models/fast-sam) | Zero-shot-instance-segmentation |

<a name="example"></a>
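Taken together, the README changes mean an instance-segmentation dataset can be produced with the task and annotator values introduced in this PR. A hypothetical invocation (class names and output format are illustrative only):

```
datadreamer --class_names person moon robot --task instance-segmentation --image_annotator owlv2-fastsam --dataset_format coco
```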

62 changes: 29 additions & 33 deletions datadreamer/dataset_annotation/fastsam_annotator.py
@@ -1,7 +1,7 @@
from __future__ import annotations

import logging
from typing import List, Literal, Tuple
from typing import List, Literal

import numpy as np
import PIL
@@ -16,10 +16,12 @@ class FastSAMAnnotator:

Attributes:
model (FastSAM): The FastSAM model.

device (str): The device on which the model will run ('cuda' for GPU, 'cpu' for CPU).
size (str): The size of the FastSAM model to use ('s' or 'x').

Methods:
annotate_batch(image, prompts, conf_threshold, use_tta, synonym_dict): Annotates the given image with bounding boxes and labels.
_init_model(): Initializes the FastSAM model.
annotate_batch(images, boxes_batch, conf_threshold, iou_threshold): Annotates the given image with given bounding boxes.
"""

def __init__(
@@ -49,49 +51,41 @@ def _init_model(self) -> FastSAM:
def annotate_batch(
self,
images: List[PIL.Image.Image],
prompts: List[str],
boxes_batch: List[np.ndarray],
scores_batch: List[np.ndarray],
labels_batch: List[np.ndarray],
conf_threshold: float = 0.5,
conf_threshold: float = 0.15,
iou_threshold: float = 0.2,
) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray], List[np.ndarray]]:
"""Annotates images using the OWLv2 model.
) -> List[List[List[float]]]:
"""Annotates images for the task of instance segmentation using the FastSAM
model.

Args:
images: The images to be annotated.
prompts: Prompts to guide the annotation.
conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.1.
boxes_batch: The bounding boxes of found objects.
conf_threshold (float, optional): Confidence threshold for the annotations. Defaults to 0.15.
iou_threshold (float, optional): Intersection over union threshold for non-maximum suppression. Defaults to 0.2.
use_tta (bool, optional): Flag to apply test-time augmentation. Defaults to False.
synonym_dict (dict, optional): Dictionary for handling synonyms in labels. Defaults to None.

Returns:
tuple: A tuple containing the final bounding boxes, scores, and labels for the annotations.
List: A list containing the final segment masks represented as a polygon.
"""
final_segments = []

n = len(images)

for i in range(n):
batch_segments = []
for box, label in zip(boxes_batch[i], labels_batch[i]):
result = self.model(
images[i],
device=self.device,
bboxes=box,
texts=prompts[label],
labels=[1],
conf=conf_threshold,
iou=iou_threshold,
verbose=False,
)
mask_segment = result[0].masks.xy[0]
print("mask", mask_segment.shape)
batch_segments.append(mask_segment)
final_segments.append(batch_segments)

return boxes_batch, scores_batch, labels_batch, final_segments
result = self.model(
images[i],
device=self.device,
bboxes=boxes_batch[i],
labels=1,
conf=conf_threshold,
iou=iou_threshold,
verbose=False,
)

mask_segments = result[0].masks.xy
final_segments.append(list(map(lambda x: x.tolist(), mask_segments)))

return final_segments


if __name__ == "__main__":
@@ -100,4 +94,6 @@ def annotate_batch(

url = "https://ultralytics.com/images/bus.jpg"
im = Image.open(requests.get(url, stream=True).raw)
annotator = FastSAMAnnotator(device="cpu", size="base")
annotator = FastSAMAnnotator(device="cpu", size="large")
final_segments = annotator.annotate_batch([im], [np.array([[3, 229, 559, 650]])])
print(len(final_segments), len(final_segments[0]), len(final_segments[0][0]))
38 changes: 17 additions & 21 deletions datadreamer/pipelines/generate_dataset_from_scratch.py
@@ -58,8 +58,8 @@

det_annotators = {"owlv2": OWLv2Annotator}
clf_annotators = {"clip": CLIPAnnotator}
inst_seg_annotators = {"owlv2_fastsam": FastSAMAnnotator}
inst_seg_to_det = {"owlv2_fastsam": OWLv2Annotator}
inst_seg_annotators = {"owlv2-fastsam": FastSAMAnnotator}
inst_seg_to_det = {"owlv2-fastsam": OWLv2Annotator}

setup_logging(use_rich=True)

@@ -122,7 +122,7 @@ def parse_args():
parser.add_argument(
"--image_annotator",
type=str,
choices=["owlv2", "clip", "owlv2_fastsam"],
choices=["owlv2", "clip", "owlv2-fastsam"],
help="Image annotator to use",
)

@@ -634,27 +634,12 @@ def read_image_batch(image_batch, batch_num, batch_size):
scores_list.extend(scores_batch)

if args.task == "instance-segmentation":
(
boxes_batch,
scores_batch,
local_labels_batch,
masks_batch,
) = inst_seg_annotator.annotate_batch(
masks_batch = inst_seg_annotator.annotate_batch(
images=images,
prompts=args.class_names,
boxes_batch=boxes_batch,
scores_batch=scores_batch,
labels_batch=local_labels_batch,
conf_threshold=args.conf_threshold,
iou_threshold=args.annotation_iou_threshold,
)
print(
"mask_batch",
len(masks_batch),
len(masks_batch[0]),
len(scores_batch),
scores_batch[0].shape,
)
segment_list.extend(masks_batch)

for j, image in enumerate(images):
@@ -667,8 +652,16 @@ def read_image_batch(image_batch, batch_num, batch_size):
score = scores_batch[j][k]
label = local_labels_batch[j][k]
if args.task == "instance-segmentation":
mask = masks_batch[j][k]
print("mask", type(mask))
if k < len(masks_batch[j]):
mask = masks_batch[j][k]
# Unzip the list of points into separate x and y lists
x_points, y_points = zip(*mask)

# Fill the polygon defined by the points to create the mask
ax.fill(
x_points, y_points, "blue", alpha=0.5
) # 'blue' for mask color and alpha for transparency

labels.append(label)
x1, y1, x2, y2 = box
rect = patches.Rectangle(
@@ -724,6 +717,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
"yolo",
args.split_ratios,
copy_files=False,
is_instance_segmentation=args.task == "instance-segmentation",
seed=args.seed,
)
# Convert annotations to COCO format
@@ -733,6 +727,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
args.save_dir,
"coco",
args.split_ratios,
is_instance_segmentation=args.task == "instance-segmentation",
copy_files=False,
seed=args.seed,
)
@@ -746,6 +741,7 @@ def read_image_batch(image_batch, batch_num, batch_size):
args.split_ratios,
dataset_plugin=args.dataset_plugin,
dataset_name=args.dataset_name,
is_instance_segmentation=args.task == "instance-segmentation",
copy_files=False,
seed=args.seed,
)
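
The pipeline wiring above makes the instance-segmentation path two-stage: the detector registered in `inst_seg_to_det` proposes boxes, and the `FastSAMAnnotator` registered in `inst_seg_annotators` converts those boxes into polygon masks. A minimal sketch of that hand-off (import path inferred from the file location), with the detector stage stubbed out by a hard-coded box since only the FastSAM side appears in this diff:

```python
import numpy as np
from PIL import Image

from datadreamer.dataset_annotation.fastsam_annotator import FastSAMAnnotator

# Stage 1 (stubbed): in the real pipeline, the OWLv2 detector proposes xyxy boxes
# for the prompted class names; here a single hard-coded box stands in for it.
images = [Image.open("generated_000.jpg")]      # hypothetical generated image
boxes_batch = [np.array([[3, 229, 559, 650]])]  # one box for the first image

# Stage 2: FastSAM segments inside each detected box.
segmenter = FastSAMAnnotator(device="cpu", size="large")
masks_batch = segmenter.annotate_batch(
    images=images,
    boxes_batch=boxes_batch,
    conf_threshold=0.15,
    iou_threshold=0.2,
)

# masks_batch[i][k] is the polygon (a list of [x, y] points) for box k of image i.
print(len(masks_batch[0]), len(masks_batch[0][0]))
```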
40 changes: 32 additions & 8 deletions datadreamer/utils/coco_converter.py
@@ -48,6 +48,21 @@ def convert(self, dataset_dir, output_dir, split_ratios, copy_files=True) -> Non
data = BaseConverter.read_annotations(annotation_path)
self.process_data(data, dataset_dir, output_dir, split_ratios, copy_files)

def convert_masks_to_coco_format(self, masks):
"""Converts masks to COCO format.

Args:
masks (list of np.ndarray): A list of masks.

Returns:
list of list of floats: A list of lists of floats representing the segmentation mask polygon.
"""
segmentations = []
for mask in masks:
segmentation = np.array(mask).reshape(-1).tolist()
segmentations.append(segmentation)
return segmentations

def process_data(
self, data, image_dir, output_dir, split_ratios, copy_files=True
) -> None:
@@ -102,26 +117,35 @@ def process_data(
}
)
masks = (
annotation["masks"]
if "masks" in annotation and self.is_instance_segmentation
else [None for i in range(len(annotation["boxes"]))]
annotation.get("masks")
if self.is_instance_segmentation
else [None] * len(annotation["boxes"])
)

# Loop through boxes, labels, and masks, appending to annotations
for box, label, mask in zip(
annotation["boxes"], annotation["labels"], masks
):
bbox = [box[0], box[1], box[2] - box[0], box[3] - box[1]]
segmentation = (
np.array(mask).reshape(-1).tolist()
if mask is not None
else None
)
area = (box[2] - box[0]) * (box[3] - box[1])

annotations.append(
{
"id": annotation_id,
"image_id": len(images_info),
"category_id": label,
"bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]],
"segmentation": np.array(mask).reshape(-1)
if mask is not None
else None, # [[box[0], box[1], box[2], box[1], box[2], box[3], box[0], box[3]]], # bbox mask
"area": (box[2] - box[0]) * (box[3] - box[1]),
"bbox": bbox,
"segmentation": segmentation,
"area": area,
"iscrowd": 0,
}
)

annotation_id += 1

if copy_files:
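
The new `convert_masks_to_coco_format` helper and the inline `segmentation` computation both reduce to the same flattening step. A small standalone illustration, assuming a mask arrives as a list of [x, y] points (as the FastSAM annotator now returns):

```python
import numpy as np

# A polygon mask as a list of [x, y] points.
mask = [[10.0, 12.0], [48.0, 12.0], [48.0, 40.0], [10.0, 40.0]]

# COCO stores each polygon as a flat [x1, y1, x2, y2, ...] list.
segmentation = np.array(mask).reshape(-1).tolist()
print(segmentation)  # [10.0, 12.0, 48.0, 12.0, 48.0, 40.0, 10.0, 40.0]
```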
2 changes: 1 addition & 1 deletion datadreamer/utils/config.py
@@ -39,7 +39,7 @@ class Config(LuxonisConfig):
# Profanity filter arguments
disable_lm_filter: bool = False
# Annotation arguments
image_annotator: Literal["owlv2", "clip", "owlv2_fastsam"] = "owlv2"
image_annotator: Literal["owlv2", "clip", "owlv2-fastsam"] = "owlv2"
conf_threshold: float = 0.15
annotation_iou_threshold: float = 0.2
use_tta: bool = False
14 changes: 14 additions & 0 deletions datadreamer/utils/luxonis_dataset_converter.py
@@ -85,6 +85,20 @@ def dataset_generator():
},
}

if "masks" in data[image_path]: # polyline format
poly = []
masks = data[image_path]["masks"]
for m in masks:
poly = [[point[0] / width, point[1] / height] for point in m]
yield {
"file": image_full_path,
"annotation": {
"type": "polyline",
"class": class_names[label],
"points": poly,
},
}

if "boxes" in data[image_path]:
boxes = data[image_path]["boxes"]
for box, label in zip(boxes, labels):
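
The polyline branch above stores each mask in the 0-1 coordinate range that the LuxonisDataset annotation expects. A short illustration of that normalization, with the image size chosen arbitrarily:

```python
# Normalize one polygon to relative coordinates, mirroring the list comprehension above.
width, height = 640, 480  # arbitrary example image size
mask = [[320.0, 120.0], [480.0, 120.0], [480.0, 360.0]]

points = [[x / width, y / height] for x, y in mask]
print(points)  # [[0.5, 0.25], [0.75, 0.25], [0.75, 0.75]]
```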
16 changes: 8 additions & 8 deletions datadreamer/utils/yolo_converter.py
@@ -152,21 +152,21 @@ def process_data(
)
with open(label_file, "w") as f:
if self.is_instance_segmentation:
for box, label in zip(
annotation["boxes"], annotation["labels"]
):
yolo_box = self.convert_to_yolo_format(
box, image_width, image_height
)
f.write(f"{label} {' '.join(map(str, yolo_box))}\n")
else:
for masks, label in zip(
annotation["masks"], annotation["labels"]
):
yolo_box = self.convert_masks_to_yolo_format(
masks, image_width, image_height
)
f.write(f"{label} {' '.join(map(str, yolo_box))}\n")
else:
for box, label in zip(
annotation["boxes"], annotation["labels"]
):
yolo_box = self.convert_to_yolo_format(
box, image_width, image_height
)
f.write(f"{label} {' '.join(map(str, yolo_box))}\n")

if copy_files:
shutil.copy(
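
`convert_masks_to_yolo_format` itself is not part of this diff; for orientation, a YOLO segmentation label line is the class id followed by the polygon's coordinates normalized to the image size. A hypothetical sketch of what the helper is assumed to produce for the `f.write` call above:

```python
def convert_masks_to_yolo_format(mask, image_width, image_height):
    # Flatten [[x, y], ...] into [x1, y1, x2, y2, ...] with coordinates scaled to 0-1.
    coords = []
    for x, y in mask:
        coords.extend([x / image_width, y / image_height])
    return coords

mask = [[320.0, 120.0], [480.0, 120.0], [480.0, 360.0]]
yolo_poly = convert_masks_to_yolo_format(mask, 640, 480)
print(f"0 {' '.join(map(str, yolo_poly))}")  # class id 0 followed by normalized points
```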
5 changes: 3 additions & 2 deletions examples/generate_dataset_and_train_yolo.ipynb
@@ -78,13 +78,13 @@
"- `--class_names` (required): Space-separated list of object names for image generation and annotation. Example: `person moon robot`.\n",
"- `--prompts_number` (optional): Number of prompts to generate for each object. Defaults to `10`.\n",
"- `--annotate_only` (optional): Only annotate the images without generating new ones, prompt and image generator will be skipped. Defaults to `False`.\n",
"- `--task`: Choose between detection and classification. Default is `detection`.\n",
"- `--task`: Choose between detection, classification and instance segmentation. Default is `detection`.\n",
"- `--dataset_format`: Format of the dataset. Defaults to `raw`. Supported values: `raw`, `yolo`, `coco`, `luxonis-dataset`, `cls-single`.\n",
"- `--split_ratios`: Split ratios for train, validation, and test sets. Defaults to `[0.8, 0.1, 0.1]`.\n",
"- `--num_objects_range`: Range of objects in a prompt. Default is 1 to 3.\n",
"- `--prompt_generator`: Choose between `simple`, `lm` (Mistral-7B), `tiny` (tiny LM), and `qwen2` (Qwen2.5 LM). Default is `qwen2`.\n",
"- `--image_generator`: Choose image generator, e.g., `sdxl`, `sdxl-turbo` or `sdxl-lightning`. Default is `sdxl-turbo`.\n",
"- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification. Default is `owlv2`.\n",
"- `--image_annotator`: Specify the image annotator, like `owlv2` for object detection or `clip` for image classification or `owlv2-fastsam` for instance segmentation. Default is `owlv2`.\n",
"- `--conf_threshold`: Confidence threshold for annotation. Default is `0.15`.\n",
"- `--annotation_iou_threshold`: Intersection over Union (IoU) threshold for annotation. Default is `0.2`.\n",
"- `--prompt_prefix`: Prefix to add to every image generation prompt. Default is `\"\"`.\n",
@@ -96,6 +96,7 @@
"- `--image_tester_patience`: Patience level for image tester. Default is `1`.\n",
"- `--lm_quantization`: Quantization to use for Mistral language model. Choose between `none` and `4bit`. Default is `none`.\n",
"- `--annotator_size`: Size of the annotator model to use. Choose between `base` and `large`. Default is `base`.\n",
"- `--disable_lm_filter`: Use only a bad word list for profanity filtering. Default is `False`.\n",
"- `--batch_size_prompt`: Batch size for prompt generation. Default is 64.\n",
"- `--batch_size_annotation`: Batch size for annotation. Default is `1`.\n",
"- `--batch_size_image`: Batch size for image generation. Default is `1`.\n",