diff --git a/README.md b/README.md index 5de1419..44d6513 100644 --- a/README.md +++ b/README.md @@ -155,24 +155,34 @@ if the baseline in this project not work for your datasets, you can install in d ## Usage - convert a single json into dataset. (`labelme_json2dataset.py`) - ```sh + ```shell labelme_json2dataset --json_file=data/test.json \ --output_dir=output/test_single_output ``` - convert a folder of jsons into voc-format dataset. (`labelme_bbox_json2voc.py`) - without label conversion - ```sh + ```shell labelme_bbox_json2voc --json_dir=data/test_jsons \ --output_dir=output/test_voc_output --labels data/label_names.txt ``` - with label conversion - ```sh + ```shell labelme_bbox_json2voc --json_dir=data/test_jsons \ --output_dir=output/test_voc_output \ --labels data/label_names.txt \ --label_dict data/label_dict.txt ``` +- splitting voc datasets into train set and test set. (`split_voc_datasets.py`) + ```shell + split_voc_datasets --voc_dir output/test_voc_output --test_ratio 0.3 --random_seed 42 + ``` + `train.txt` and `test.txt` should be generated in `voc_dir/ImageSets/Main/`. + +- turn voc format dataset into coco style dataset. (`voc2coco.py`) + ```shell + voc2coco --voc_dir output/test_voc_output --coco_dir output/test_coco_output + ```
diff --git a/README.zh.md b/README.zh.md index 24f1ab2..398af34 100644 --- a/README.zh.md +++ b/README.zh.md @@ -171,11 +171,18 @@ --labels data/label_names.txt \ --label_dict data/label_dict.txt ``` - +- 分割 VOC 数据的训练集和测试集。 (`split_voc_datasets.py`) + ```shell + split_voc_datasets --voc_dir output/test_voc_output --test_ratio 0.3 --random_seed 42 + ``` + `train.txt` 和 `test.txt` 会出现在 `voc_dir/ImageSets/Main/` 文件夹下。 + +- 将 VOC 数据集转换为 COCO 数据集 (`voc2coco.py`) + ```shell + voc2coco --voc_dir output/test_voc_output --coco_dir output/test_coco_output + ``` - - ## Roadmap diff --git a/labelme2datasets/split_voc_datasets.py b/labelme2datasets/split_voc_datasets.py new file mode 100644 index 0000000..e42f970 --- /dev/null +++ b/labelme2datasets/split_voc_datasets.py @@ -0,0 +1,59 @@ +"""splitting voc format datasets into training set and test set""" +# coding=utf-8 + +import argparse +import sys +import os +import os.path as osp +import glob +from pathlib import Path +from sklearn.model_selection import train_test_split + + +def main(): +    """splitting voc format datasets into training set and test set""" +    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) +    parser.add_argument('--voc_dir', help='input annotated directory') +    parser.add_argument('--test_ratio', help='test set ratio', default=0.3) +    parser.add_argument('--random_seed', help='random seed ', default=42) +    args = parser.parse_args() + +    if not osp.exists(args.voc_dir): +        print('directory not exists:', args.voc_dir) +        sys.exit(1) + +    annotation_dir = osp.join(args.voc_dir, 'Annotations') +    if not osp.exists(annotation_dir): +        print('annotation directory not exists:', annotation_dir) +        sys.exit(1) + +    output_dir = osp.join(args.voc_dir, 'ImageSets', 'Main') +    if not osp.exists(output_dir): +        os.makedirs(output_dir) + +    train_file = osp.join(output_dir, 'train.txt') +    test_file = osp.join(output_dir, 'test.txt') +    if osp.exists(train_file) or osp.exists(test_file): + 
print(f'train.txt: {train_file} exists or test.txt: {test_file} exists, please check!') + sys.exit(1) + + total_files = glob.glob(osp.join(annotation_dir, '*.xml')) + total_files = [Path(o).stem for o in total_files] + train_set, test_set = train_test_split(total_files, + test_size=float(args.test_ratio), + random_state=int(args.random_seed)) + + with open(train_file, 'w', encoding='utf8') as train_f: + for file in train_set: + train_f.write(file + "\n") + + with open(test_file, 'w', encoding='utf8') as test_f: + for file in test_set: + test_f.write(file + "\n") + + print(f"split Completed. Number of Train Samples: {len(train_set)}." + f" Number of Test Samples: {len(test_set)}") + + +if __name__ == '__main__': + main() diff --git a/labelme2datasets/utils.py b/labelme2datasets/utils.py index 18e361b..da1c667 100644 --- a/labelme2datasets/utils.py +++ b/labelme2datasets/utils.py @@ -4,6 +4,7 @@ # coding=utf-8 +import os.path as osp def get_label_conversion_dict(dict_file): """ @@ -21,3 +22,19 @@ def get_label_conversion_dict(dict_file): words = line.split(":") label_dict[words[0].strip()] = words[1].strip() return label_dict + + +def get_coco_category(labels_file): + """生成标签字典,用于生成COCO数据集时供查询""" + if not osp.exists(labels_file): + print('file not exists:', labels_file) + return None + attr_dict = {"categories": []} + label_id = 0 + with open(labels_file, "r", encoding='UTF-8') as label_f: + for line in label_f: + label = line.strip() + label_item = {"supercategory": "defect", "id": label_id, "name": label} + attr_dict["categories"].append(label_item) + label_id += 1 + return attr_dict diff --git a/labelme2datasets/voc2coco.py b/labelme2datasets/voc2coco.py new file mode 100644 index 0000000..6bf4cf1 --- /dev/null +++ b/labelme2datasets/voc2coco.py @@ -0,0 +1,175 @@ +"""turn voc format datasets into coco format datasets""" +# coding = utf-8 + +import argparse +import sys +import os +import os.path as osp +from pathlib import Path +import json +import glob +from
collections import OrderedDict +import shutil +import xmltodict +from labelme2datasets.utils import get_coco_category + + +def get_xml_anno_list(set_file, voc_dir, coco_dir): + """get_xml_anno_list and copy source image to coco dir.""" + voc_split = Path(set_file).stem # e.g. train.txt -> train + anno_list = [] + with open(set_file, "r", encoding='UTF-8') as f_open: + for line in f_open: + base = line.strip() + # absolute path of file + anno_list.append(osp.join(voc_dir, 'Annotations', base + ".xml")) + # copy image to COCO dataset + # MARK: jpg or png or other pic suffix + image_from = osp.join(voc_dir, "JPEGImages", base + ".jpg") + if not osp.exists(image_from): + print(f"some thing wrong, file not exists: {image_from}") + image_dest = osp.join(coco_dir, voc_split) + shutil.copy(image_from, image_dest) + # print("copy image {} to {}".format(base, image_dest)) + print("build anno_list. total samples:", len(anno_list)) + return anno_list + + +def get_image_with_anno(anno_file): + """get image with annotation""" + image = {} + with open(anno_file, 'r', encoding='utf8') as f_open: + doc = xmltodict.parse(f_open.read()) + image['file_name'] = str(doc['annotation']['filename']) + image['height'] = int(doc['annotation']['size']['height']) + image['width'] = int(doc['annotation']['size']['width']) + return image + + +def get_coco_anno_with_file(anno_file, image_id, attr_dict): + """get coco annotation with file""" + annotations = [] + with open(anno_file, 'r', encoding='utf8') as f_open: + doc = xmltodict.parse(f_open.read()) + anno_id = 1 + if 'object' in doc['annotation']: + objects = doc['annotation']['object'] + if isinstance(objects, OrderedDict): + obj = objects + objects = [obj] + + for obj in objects: + for value in attr_dict["categories"]: + if str(obj['name']) != value["name"]: + continue + annotation = {"iscrowd": 0, "image_id": image_id} + # annotation["segmentation"] = [] + box_x = int(float(obj["bndbox"]["xmin"])) + box_y = 
int(float(obj["bndbox"]["ymin"])) + box_w = int(float(obj["bndbox"]["xmax"])) - box_x + 1 + box_h = int(float(obj["bndbox"]["ymax"])) - box_y + 1 + annotation["bbox"] = [box_x, box_y, box_w, box_h] + annotation["area"] = float(box_w * box_h) + annotation["category_id"] = value["id"] + annotation["ignore"] = 0 + annotation["segmentation"] = [[box_x, box_y, box_x, (box_y + box_h - 1), + (box_x + box_w - 1), (box_y + box_h - 1), + (box_x + box_w - 1), box_y]] + annotation["id"] = anno_id + anno_id += 1 + annotations.append(annotation) + else: + print(f"File: {anno_file} doesn't have any object") + + return annotations + + +def save_coco_json(attr_dict, anno_path): + """save coco json file""" + json_string = json.dumps(attr_dict) + with open(anno_path, "w", encoding="utf8") as anno_f: + anno_f.write(json_string) + + +def generate_coco_annotation(set_file, voc_dir, coco_dir): + """ + generate coco annotation from voc annotation + """ + voc_split = Path(set_file).stem # e.g. train.txt -> train + anno_file = voc_split + '.json' + + if osp.exists(osp.join(coco_dir, voc_split)): + print("directory not supposed to exist: ", osp.join(coco_dir, voc_split)) + sys.exit(1) + os.makedirs(osp.join(coco_dir, voc_split)) + + anno_path = osp.join(coco_dir, 'annotations', anno_file) + if osp.exists(anno_path): + print('anno file exists:', anno_path) + sys.exit(1) + + # check class_names.txt in voc dataset + attr_dict = get_coco_category(osp.join(voc_dir, 'class_names.txt')) + if attr_dict is None: + print('class_names.txt not found') + sys.exit(1) + + anno_list = get_xml_anno_list(set_file, voc_dir, coco_dir) + + image_id = 0 + images = [] + annotations = [] + + for file in anno_list: + if not osp.exists(file): + print("file not exists", file) + continue + + image_id += 1 + + # get image info + image = get_image_with_anno(file) + image['id'] = image_id + images.append(image) + + # get annotation info + current_annotations = get_coco_anno_with_file(file, image_id, attr_dict) + for item 
in current_annotations: + annotations.append(item) + + attr_dict["images"] = images + attr_dict["annotations"] = annotations + attr_dict["type"] = "instances" + + # save file + save_coco_json(attr_dict, anno_path) + + +def main(): + """turn voc format datasets into coco format datasets.""" + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--voc_dir', help='INPUT: voc style dataset root directory') + parser.add_argument('--coco_dir', help='OUTPUT: coco style dataset root directory') + args = parser.parse_args() + + if not osp.exists(args.voc_dir): + print('directory not exists:', args.voc_dir) + sys.exit(1) + + set_files = glob.glob(osp.join(args.voc_dir, 'ImageSets', 'Main', '*.txt')) + if len(set_files) == 0: + print(f"set file not exists: {osp.join(args.voc_dir, 'ImageSets', 'Main')}") + sys.exit(1) + + if not osp.exists(args.coco_dir): + os.makedirs(args.coco_dir) + os.makedirs(osp.join(args.coco_dir, 'annotations')) + + # iterate every set file(eg. 
train.txt、test.txt) + for set_file in set_files: + generate_coco_annotation(set_file, args.voc_dir, args.coco_dir) + + +if __name__ == '__main__': + main() diff --git a/output/test_coco_output/annotations/test.json b/output/test_coco_output/annotations/test.json new file mode 100644 index 0000000..4ef51bd --- /dev/null +++ b/output/test_coco_output/annotations/test.json @@ -0,0 +1 @@ +{"categories": [{"supercategory": "defect", "id": 0, "name": "_background_"}, {"supercategory": "defect", "id": 1, "name": "BuDaoDian"}, {"supercategory": "defect", "id": 2, "name": "CaHua"}, {"supercategory": "defect", "id": 3, "name": "JiaoWeiLouDi"}, {"supercategory": "defect", "id": 4, "name": "JuPi"}, {"supercategory": "defect", "id": 5, "name": "LouDi"}, {"supercategory": "defect", "id": 6, "name": "PengLiu"}, {"supercategory": "defect", "id": 7, "name": "QiPao"}, {"supercategory": "defect", "id": 8, "name": "QiKeng"}, {"supercategory": "defect", "id": 9, "name": "ZaSe"}, {"supercategory": "defect", "id": 10, "name": "ZangDian"}], "images": [{"file_name": "20180928115522.jpg", "height": 1920, "width": 2560, "id": 1}], "annotations": [{"iscrowd": 0, "image_id": 1, "bbox": [1090, 2, 218, 2559], "area": 557862.0, "category_id": 3, "ignore": 0, "segmentation": [[1090, 2, 1090, 2560, 1307, 2560, 1307, 2]], "id": 1}, {"iscrowd": 0, "image_id": 1, "bbox": [1322, 0, 331, 2561], "area": 847691.0, "category_id": 1, "ignore": 0, "segmentation": [[1322, 0, 1322, 2560, 1652, 2560, 1652, 0]], "id": 2}], "type": "instances"} \ No newline at end of file diff --git a/output/test_coco_output/annotations/train.json b/output/test_coco_output/annotations/train.json new file mode 100644 index 0000000..ff146ce --- /dev/null +++ b/output/test_coco_output/annotations/train.json @@ -0,0 +1 @@ +{"categories": [{"supercategory": "defect", "id": 0, "name": "_background_"}, {"supercategory": "defect", "id": 1, "name": "BuDaoDian"}, {"supercategory": "defect", "id": 2, "name": "CaHua"}, {"supercategory": 
"defect", "id": 3, "name": "JiaoWeiLouDi"}, {"supercategory": "defect", "id": 4, "name": "JuPi"}, {"supercategory": "defect", "id": 5, "name": "LouDi"}, {"supercategory": "defect", "id": 6, "name": "PengLiu"}, {"supercategory": "defect", "id": 7, "name": "QiPao"}, {"supercategory": "defect", "id": 8, "name": "QiKeng"}, {"supercategory": "defect", "id": 9, "name": "ZaSe"}, {"supercategory": "defect", "id": 10, "name": "ZangDian"}], "images": [{"file_name": "20180928115506.jpg", "height": 1920, "width": 2560, "id": 1}, {"file_name": "20180928115538.jpg", "height": 1920, "width": 2560, "id": 2}], "annotations": [{"iscrowd": 0, "image_id": 1, "bbox": [1178, 2, 269, 2559], "area": 688371.0, "category_id": 1, "ignore": 0, "segmentation": [[1178, 2, 1178, 2560, 1446, 2560, 1446, 2]], "id": 1}, {"iscrowd": 0, "image_id": 1, "bbox": [1017, 0, 156, 2561], "area": 399516.0, "category_id": 3, "ignore": 0, "segmentation": [[1017, 0, 1017, 2560, 1172, 2560, 1172, 0]], "id": 2}, {"iscrowd": 0, "image_id": 2, "bbox": [1075, 0, 183, 2561], "area": 468663.0, "category_id": 3, "ignore": 0, "segmentation": [[1075, 0, 1075, 2560, 1257, 2560, 1257, 0]], "id": 1}, {"iscrowd": 0, "image_id": 2, "bbox": [1293, 0, 295, 2561], "area": 755495.0, "category_id": 1, "ignore": 0, "segmentation": [[1293, 0, 1293, 2560, 1587, 2560, 1587, 0]], "id": 2}], "type": "instances"} \ No newline at end of file diff --git a/output/test_coco_output/test/20180928115522.jpg b/output/test_coco_output/test/20180928115522.jpg new file mode 100644 index 0000000..f06a1c6 Binary files /dev/null and b/output/test_coco_output/test/20180928115522.jpg differ diff --git a/output/test_coco_output/train/20180928115506.jpg b/output/test_coco_output/train/20180928115506.jpg new file mode 100644 index 0000000..01b4253 Binary files /dev/null and b/output/test_coco_output/train/20180928115506.jpg differ diff --git a/output/test_coco_output/train/20180928115538.jpg b/output/test_coco_output/train/20180928115538.jpg new file mode 
100644 index 0000000..3f99b9f Binary files /dev/null and b/output/test_coco_output/train/20180928115538.jpg differ diff --git a/output/test_voc_output/ImageSets/Main/test.txt b/output/test_voc_output/ImageSets/Main/test.txt new file mode 100644 index 0000000..4964c85 --- /dev/null +++ b/output/test_voc_output/ImageSets/Main/test.txt @@ -0,0 +1 @@ +20180928115522 diff --git a/output/test_voc_output/ImageSets/Main/train.txt b/output/test_voc_output/ImageSets/Main/train.txt new file mode 100644 index 0000000..9bc3c14 --- /dev/null +++ b/output/test_voc_output/ImageSets/Main/train.txt @@ -0,0 +1,2 @@ +20180928115506 +20180928115538 diff --git a/requirements.txt b/requirements.txt index 7530b8b..8daba0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,8 @@ imgviz~=1.4.1 pillow~=8.4.0 labelme~=4.5.13 lxml~=4.6.4 -progressbar~=2.5 \ No newline at end of file +progressbar~=2.5 +setuptools~=58.0.4 +xmltodict~=0.12.0 +sklearn~=0.0 +scikit-learn~=0.24.2 \ No newline at end of file diff --git a/setup.py b/setup.py index 8a1f54e..142914f 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ setup( name='labelme2datasets', - version='0.0.1', + version='0.0.2', description='python scripts to convert labelme-generated-jsons to voc/coco style datasets.', long_description=long_description, long_description_content_type="text/markdown", @@ -18,12 +18,17 @@ 'pillow~=8.4.0', 'labelme~=4.5.13', 'lxml~=4.6.4', - 'progressbar~=2.5' + 'progressbar~=2.5', + 'xmltodict~=0.12.0', + 'sklearn~=0.0', + 'scikit-learn~=0.24.2', ], entry_points={ 'console_scripts': [ 'labelme_json2dataset = labelme2datasets.labelme_json2dataset:main', - 'labelme_bbox_json2voc = labelme2datasets.labelme_bbox_json2voc:main' + 'labelme_bbox_json2voc = labelme2datasets.labelme_bbox_json2voc:main', + 'split_voc_datasets = labelme2datasets.split_voc_datasets:main', + 'voc2coco = labelme2datasets.voc2coco:main', ] }, classifiers=[