Commit

ADD: voc2coco

veraposeidon committed Nov 21, 2021
1 parent c4748df commit 13e34db
Showing 14 changed files with 292 additions and 10 deletions.
16 changes: 13 additions & 3 deletions README.md
@@ -155,24 +155,34 @@ if the baseline in this project does not work for your datasets, you can install in d
## Usage

- convert a single labelme JSON file into a dataset. (`labelme_json2dataset.py`)
```shell
labelme_json2dataset --json_file=data/test.json \
--output_dir=output/test_single_output
```

- convert a folder of labelme JSON files into a VOC-format dataset. (`labelme_bbox_json2voc.py`)
- without label conversion
```shell
labelme_bbox_json2voc --json_dir=data/test_jsons \
--output_dir=output/test_voc_output --labels data/label_names.txt
```
- with label conversion
```shell
labelme_bbox_json2voc --json_dir=data/test_jsons \
--output_dir=output/test_voc_output \
--labels data/label_names.txt \
--label_dict data/label_dict.txt
```
- split a VOC dataset into a train set and a test set. (`split_voc_datasets.py`)
```shell
split_voc_datasets --voc_dir output/test_voc_output --test_ratio 0.3 --random_seed 42
```
`train.txt` and `test.txt` should be generated in `voc_dir/ImageSets/Main/`.

- convert a VOC-format dataset into a COCO-style dataset (`voc2coco.py`); the expected output layout is sketched after this list.
```shell
voc2coco --voc_dir output/test_voc_output --coco_dir output/test_coco_output
```
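
A sketch of the expected COCO output layout, based on the test files added in this commit:
```
output/test_coco_output/
├── annotations/
│   ├── train.json
│   └── test.json
├── train/   # images listed in train.txt, copied from JPEGImages
└── test/    # images listed in test.txt
```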

<p align="right">(<a href="#top">back to top</a>)</p>

13 changes: 10 additions & 3 deletions README.zh.md
@@ -171,11 +171,18 @@
--labels data/label_names.txt \
--label_dict data/label_dict.txt
```

- split the VOC dataset into a train set and a test set. (`split_voc_datasets.py`)
```shell
split_voc_datasets --voc_dir output/test_voc_output --test_ratio 0.3 --random_seed 42
```
`train.txt` and `test.txt` will be generated under the `voc_dir/ImageSets/Main/` folder.

- convert the VOC dataset into a COCO dataset. (`voc2coco.py`)
```shell
voc2coco --voc_dir output/test_voc_output --coco_dir output/test_coco_output
```
<p align="right">(<a href="#top">back to top</a>)</p>



<!-- ROADMAP -->
## Roadmap

59 changes: 59 additions & 0 deletions labelme2datasets/split_voc_datasets.py
@@ -0,0 +1,59 @@
"""splitting voc format datasets into training set and test set"""
# coding=utf-8

import argparse
import sys
import os
import os.path as osp
import glob
from pathlib import Path
from sklearn.model_selection import train_test_split


def main():
"""splitting voc format datasets into training set and test set"""
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--voc_dir', help='input annotated directory')
parser.add_argument('--test_ratio', help='test set ratio', default=0.3)
parser.add_argument('--random_seed', help='random seed ', default=42)
args = parser.parse_args()

if not osp.exists(args.voc_dir):
print('directory not exists:', args.voc_dir)
sys.exit(1)

annotation_dir = osp.join(args.voc_dir, 'Annotations')
if not osp.exists(annotation_dir):
print('annotation directory not exists:', annotation_dir)
sys.exit(1)

output_dir = osp.join(args.voc_dir, 'ImageSets', 'Main')
if not osp.exists(output_dir):
os.makedirs(output_dir)

train_file = osp.join(output_dir, 'train.txt')
test_file = osp.join(output_dir, 'test.txt')
if osp.exists(train_file) or osp.exists(test_file):
print(f'train.txt: {train_file} exists or test.txt: {train_file} exists,please check!')
sys.exit(1)

total_files = glob.glob(osp.join(annotation_dir, '*.xml'))
total_files = [Path(o).stem for o in total_files]
train_set, test_set = train_test_split(total_files,
test_size=float(args.test_ratio),
random_state=int(args.random_seed))

with open(train_file, 'w', encoding='utf8') as train_f:
for file in train_set:
train_f.write(file + "\n")

with open(test_file, 'w', encoding='utf8') as test_f:
for file in test_set:
test_f.write(file + "\n")

print(f"split Completed. Number of Train Samples: {len(train_set)}."
f" Number of Test Samples: {len(test_set)}")


if __name__ == '__main__':
main()
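
With the three test samples committed in this repo, a run looks roughly like this (2 train / 1 test, matching the committed train.txt and test.txt):

```shell
split_voc_datasets --voc_dir output/test_voc_output --test_ratio 0.3 --random_seed 42
# Split completed. Number of train samples: 2. Number of test samples: 1
```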
17 changes: 17 additions & 0 deletions labelme2datasets/utils.py
@@ -4,6 +4,7 @@

# coding=utf-8

import os.path as osp

def get_label_conversion_dict(dict_file):
    """
@@ -21,3 +22,19 @@ def get_label_conversion_dict(dict_file):
            words = line.split(":")
            label_dict[words[0].strip()] = words[1].strip()
    return label_dict


def get_coco_category(labels_file):
    """Build the category dictionary used for lookup when generating the COCO dataset."""
    if not osp.exists(labels_file):
        print('file does not exist:', labels_file)
        return None
    attr_dict = {"categories": []}
    label_id = 0
    with open(labels_file, "r", encoding='UTF-8') as label_f:
        for line in label_f:
            label = line.strip()
            label_item = {"supercategory": "defect", "id": label_id, "name": label}
            attr_dict["categories"].append(label_item)
            label_id += 1
    return attr_dict
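
As a quick illustration of `get_coco_category` (a sketch; it assumes the labels file, here `data/label_names.txt` from the README usage, holds the class names that also appear in the committed test.json below):

```python
from labelme2datasets.utils import get_coco_category

# labels file with one class name per line, e.g.:
#   _background_
#   BuDaoDian
#   CaHua
attr_dict = get_coco_category("data/label_names.txt")
print(attr_dict)
# {'categories': [{'supercategory': 'defect', 'id': 0, 'name': '_background_'},
#                 {'supercategory': 'defect', 'id': 1, 'name': 'BuDaoDian'},
#                 {'supercategory': 'defect', 'id': 2, 'name': 'CaHua'}, ...]}
```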
175 changes: 175 additions & 0 deletions labelme2datasets/voc2coco.py
@@ -0,0 +1,175 @@
"""turn voc format datasets into coco format datasets"""
# coding = utf-8

import argparse
import sys
import os
import os.path as osp
from pathlib import Path
import json
import glob
from collections import OrderedDict
import shutil
import xmltodict
from labelme2datasets.utils import get_coco_category


def get_xml_anno_list(set_file, voc_dir, coco_dir):
"""get_xml_anno_list and copy source image to coco dir."""
voc_split = Path(set_file).stem # e.g. train.txt -> train
anno_list = []
with open(set_file, "r", encoding='UTF-8') as f_open:
for line in f_open:
base = line.strip()
# absolute path of file
anno_list.append(osp.join(voc_dir, 'Annotations', base + ".xml"))
# copy image to COCO dataset
# MARK: jpg or png or other pic suffix
image_from = osp.join(voc_dir, "JPEGImages", base + ".jpg")
if not osp.exists(image_from):
print(f"some thing wrong, file not exists: {image_from}")
image_dest = osp.join(coco_dir, voc_split)
shutil.copy(image_from, image_dest)
# print("copy image {} to {}".format(base, image_dest))
print("build anno_list. total samples:", len(anno_list))
return anno_list


def get_image_with_anno(anno_file):
"""get image with annotation"""
image = {}
with open(anno_file, 'r', encoding='utf8') as f_open:
doc = xmltodict.parse(f_open.read())
image['file_name'] = str(doc['annotation']['filename'])
image['height'] = int(doc['annotation']['size']['height'])
image['width'] = int(doc['annotation']['size']['width'])
return image


def get_coco_anno_with_file(anno_file, image_id, attr_dict):
"""get coco annotation with file"""
annotations = []
with open(anno_file, 'r', encoding='utf8') as f_open:
doc = xmltodict.parse(f_open.read())
anno_id = 1
if 'object' in doc['annotation']:
objects = doc['annotation']['object']
if isinstance(objects, OrderedDict):
obj = objects
objects = [obj]

for obj in objects:
for value in attr_dict["categories"]:
if str(obj['name']) != value["name"]:
continue
annotation = {"iscrowd": 0, "image_id": image_id}
# annotation["segmentation"] = []
box_x = int(float(obj["bndbox"]["xmin"]))
box_y = int(float(obj["bndbox"]["ymin"]))
box_w = int(float(obj["bndbox"]["xmax"])) - box_x + 1
box_h = int(float(obj["bndbox"]["ymax"])) - box_y + 1
annotation["bbox"] = [box_x, box_y, box_w, box_h]
annotation["area"] = float(box_w * box_h)
annotation["category_id"] = value["id"]
annotation["ignore"] = 0
annotation["segmentation"] = [[box_x, box_y, box_x, (box_y + box_h - 1),
(box_x + box_w - 1), (box_y + box_h - 1),
(box_x + box_w - 1), box_y]]
annotation["id"] = anno_id
anno_id += 1
annotations.append(annotation)
else:
print(f"File: {anno_file} doesn't have any object")

return annotations
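
# Worked example (cf. the committed test.json below): a VOC box with
# xmin=1090, ymin=2, xmax=1307, ymax=2560 converts to:
#   bbox         = [1090, 2, 218, 2559]    # [x, y, w, h]; w = 1307 - 1090 + 1
#   area         = 557862.0                # 218 * 2559
#   segmentation = [[1090, 2, 1090, 2560, 1307, 2560, 1307, 2]]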


def save_coco_json(attr_dict, anno_path):
    """save coco json file"""
    json_string = json.dumps(attr_dict)
    with open(anno_path, "w", encoding="utf8") as anno_f:
        anno_f.write(json_string)


def generate_coco_annotation(set_file, voc_dir, coco_dir):
    """generate coco annotation from voc annotation"""
    voc_split = Path(set_file).stem  # e.g. train.txt -> train
    anno_file = voc_split + '.json'

    if osp.exists(osp.join(coco_dir, voc_split)):
        print("output image directory already exists:", osp.join(coco_dir, voc_split))
        sys.exit(1)
    os.makedirs(osp.join(coco_dir, voc_split))

    anno_path = osp.join(coco_dir, 'annotations', anno_file)
    if osp.exists(anno_path):
        print('anno file already exists:', anno_path)
        sys.exit(1)

    # check class_names.txt in the voc dataset
    attr_dict = get_coco_category(osp.join(voc_dir, 'class_names.txt'))
    if attr_dict is None:
        print('class_names.txt not found')
        sys.exit(1)

    anno_list = get_xml_anno_list(set_file, voc_dir, coco_dir)

    image_id = 0
    images = []
    annotations = []

    for file in anno_list:
        if not osp.exists(file):
            print("file does not exist:", file)
            continue

        image_id += 1

        # get image info
        image = get_image_with_anno(file)
        image['id'] = image_id
        images.append(image)

        # get annotation info
        current_annotations = get_coco_anno_with_file(file, image_id, attr_dict)
        annotations.extend(current_annotations)

    attr_dict["images"] = images
    attr_dict["annotations"] = annotations
    attr_dict["type"] = "instances"

    # save file
    save_coco_json(attr_dict, anno_path)


def main():
    """turn voc format datasets into coco format datasets."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--voc_dir', help='INPUT: voc style dataset root directory')
    parser.add_argument('--coco_dir', help='OUTPUT: coco style dataset root directory')
    args = parser.parse_args()

    if not osp.exists(args.voc_dir):
        print('directory does not exist:', args.voc_dir)
        sys.exit(1)

    set_files = glob.glob(osp.join(args.voc_dir, 'ImageSets', 'Main', '*.txt'))
    if len(set_files) == 0:
        print(f"no set files found in: {osp.join(args.voc_dir, 'ImageSets', 'Main')}")
        sys.exit(1)

    if not osp.exists(args.coco_dir):
        os.makedirs(args.coco_dir)
    if not osp.exists(osp.join(args.coco_dir, 'annotations')):
        os.makedirs(osp.join(args.coco_dir, 'annotations'))

    # iterate over every set file (e.g. train.txt, test.txt)
    for set_file in set_files:
        generate_coco_annotation(set_file, args.voc_dir, args.coco_dir)


if __name__ == '__main__':
    main()
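
After conversion, the generated annotation files can be sanity-checked with a few lines of standard-library Python; a minimal sketch, assuming the test output paths used above:

```python
import json

# load the generated COCO annotation file (path from the usage example above)
with open('output/test_coco_output/annotations/train.json', encoding='utf8') as f:
    coco = json.load(f)

print(len(coco['images']), 'images,', len(coco['annotations']), 'annotations')
print([cat['name'] for cat in coco['categories']][:3])  # first few category names
```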
1 change: 1 addition & 0 deletions output/test_coco_output/annotations/test.json
@@ -0,0 +1 @@
{"categories": [{"supercategory": "defect", "id": 0, "name": "_background_"}, {"supercategory": "defect", "id": 1, "name": "BuDaoDian"}, {"supercategory": "defect", "id": 2, "name": "CaHua"}, {"supercategory": "defect", "id": 3, "name": "JiaoWeiLouDi"}, {"supercategory": "defect", "id": 4, "name": "JuPi"}, {"supercategory": "defect", "id": 5, "name": "LouDi"}, {"supercategory": "defect", "id": 6, "name": "PengLiu"}, {"supercategory": "defect", "id": 7, "name": "QiPao"}, {"supercategory": "defect", "id": 8, "name": "QiKeng"}, {"supercategory": "defect", "id": 9, "name": "ZaSe"}, {"supercategory": "defect", "id": 10, "name": "ZangDian"}], "images": [{"file_name": "20180928115522.jpg", "height": 1920, "width": 2560, "id": 1}], "annotations": [{"iscrowd": 0, "image_id": 1, "bbox": [1090, 2, 218, 2559], "area": 557862.0, "category_id": 3, "ignore": 0, "segmentation": [[1090, 2, 1090, 2560, 1307, 2560, 1307, 2]], "id": 1}, {"iscrowd": 0, "image_id": 1, "bbox": [1322, 0, 331, 2561], "area": 847691.0, "category_id": 1, "ignore": 0, "segmentation": [[1322, 0, 1322, 2560, 1652, 2560, 1652, 0]], "id": 2}], "type": "instances"}
1 change: 1 addition & 0 deletions output/test_coco_output/annotations/train.json
@@ -0,0 +1 @@
{"categories": [{"supercategory": "defect", "id": 0, "name": "_background_"}, {"supercategory": "defect", "id": 1, "name": "BuDaoDian"}, {"supercategory": "defect", "id": 2, "name": "CaHua"}, {"supercategory": "defect", "id": 3, "name": "JiaoWeiLouDi"}, {"supercategory": "defect", "id": 4, "name": "JuPi"}, {"supercategory": "defect", "id": 5, "name": "LouDi"}, {"supercategory": "defect", "id": 6, "name": "PengLiu"}, {"supercategory": "defect", "id": 7, "name": "QiPao"}, {"supercategory": "defect", "id": 8, "name": "QiKeng"}, {"supercategory": "defect", "id": 9, "name": "ZaSe"}, {"supercategory": "defect", "id": 10, "name": "ZangDian"}], "images": [{"file_name": "20180928115506.jpg", "height": 1920, "width": 2560, "id": 1}, {"file_name": "20180928115538.jpg", "height": 1920, "width": 2560, "id": 2}], "annotations": [{"iscrowd": 0, "image_id": 1, "bbox": [1178, 2, 269, 2559], "area": 688371.0, "category_id": 1, "ignore": 0, "segmentation": [[1178, 2, 1178, 2560, 1446, 2560, 1446, 2]], "id": 1}, {"iscrowd": 0, "image_id": 1, "bbox": [1017, 0, 156, 2561], "area": 399516.0, "category_id": 3, "ignore": 0, "segmentation": [[1017, 0, 1017, 2560, 1172, 2560, 1172, 0]], "id": 2}, {"iscrowd": 0, "image_id": 2, "bbox": [1075, 0, 183, 2561], "area": 468663.0, "category_id": 3, "ignore": 0, "segmentation": [[1075, 0, 1075, 2560, 1257, 2560, 1257, 0]], "id": 1}, {"iscrowd": 0, "image_id": 2, "bbox": [1293, 0, 295, 2561], "area": 755495.0, "category_id": 1, "ignore": 0, "segmentation": [[1293, 0, 1293, 2560, 1587, 2560, 1587, 0]], "id": 2}], "type": "instances"}
Binary file added output/test_coco_output/test/20180928115522.jpg
Binary file added output/test_coco_output/train/20180928115506.jpg
Binary file added output/test_coco_output/train/20180928115538.jpg
1 change: 1 addition & 0 deletions output/test_voc_output/ImageSets/Main/test.txt
@@ -0,0 +1 @@
20180928115522
2 changes: 2 additions & 0 deletions output/test_voc_output/ImageSets/Main/train.txt
@@ -0,0 +1,2 @@
20180928115506
20180928115538
6 changes: 5 additions & 1 deletion requirements.txt
@@ -2,4 +2,8 @@ imgviz~=1.4.1
pillow~=8.4.0
labelme~=4.5.13
lxml~=4.6.4
progressbar~=2.5
setuptools~=58.0.4
xmltodict~=0.12.0
sklearn~=0.0
scikit-learn~=0.24.2
11 changes: 8 additions & 3 deletions setup.py
@@ -6,7 +6,7 @@

setup(
    name='labelme2datasets',
    version='0.0.2',
    description='python scripts to convert labelme-generated-jsons to voc/coco style datasets.',
    long_description=long_description,
    long_description_content_type="text/markdown",
@@ -18,12 +18,17 @@
        'pillow~=8.4.0',
        'labelme~=4.5.13',
        'lxml~=4.6.4',
        'progressbar~=2.5',
        'xmltodict~=0.12.0',
        'sklearn~=0.0',
        'scikit-learn~=0.24.2',
    ],
    entry_points={
        'console_scripts': [
            'labelme_json2dataset = labelme2datasets.labelme_json2dataset:main',
            'labelme_bbox_json2voc = labelme2datasets.labelme_bbox_json2voc:main',
            'split_voc_datasets = labelme2datasets.split_voc_datasets:main',
            'voc2coco = labelme2datasets.voc2coco:main',
        ]
    },
    classifiers=[
