Commit

ADD: voc2coco

veraposeidon committed Nov 21, 2021
1 parent c4748df commit 13e34db
Showing 14 changed files with 292 additions and 10 deletions.
16 changes: 13 additions & 3 deletions README.md
@@ -155,24 +155,34 @@ if the baseline in this project does not work for your datasets, you can install in d
## Usage

- convert a single labelme JSON file into a dataset. (`labelme_json2dataset.py`)
```shell
labelme_json2dataset --json_file=data/test.json \
--output_dir=output/test_single_output
```

- convert a folder of labelme JSON files into a VOC-format dataset. (`labelme_bbox_json2voc.py`)
- without label conversion
```shell
labelme_bbox_json2voc --json_dir=data/test_jsons \
--output_dir=output/test_voc_output --labels data/label_names.txt
```
- with label conversion
```shell
labelme_bbox_json2voc --json_dir=data/test_jsons \
--output_dir=output/test_voc_output \
--labels data/label_names.txt \
--label_dict data/label_dict.txt
```
- split a VOC dataset into a train set and a test set. (`split_voc_datasets.py`)
```shell
split_voc_datasets --voc_dir output/test_voc_output --test_ratio 0.3 --random_seed 42
```
`train.txt` and `test.txt` should be generated in `voc_dir/ImageSets/Main/`.

- convert a VOC-format dataset into a COCO-style dataset (`voc2coco.py`); the expected output layout is sketched after this list.
```shell
voc2coco --voc_dir output/test_voc_output --coco_dir output/test_coco_output
```
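
A sketch of the expected COCO output layout, based on the test files added in this commit:
```
output/test_coco_output/
├── annotations/
│   ├── train.json
│   └── test.json
├── train/   # images listed in train.txt, copied from JPEGImages
└── test/    # images listed in test.txt
```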

<p align="right">(<a href="#top">back to top</a>)</p>

13 changes: 10 additions & 3 deletions README.zh.md
@@ -171,11 +171,18 @@
--labels data/label_names.txt \
--label_dict data/label_dict.txt
```

- split the VOC dataset into a train set and a test set. (`split_voc_datasets.py`)
```shell
split_voc_datasets --voc_dir output/test_voc_output --test_ratio 0.3 --random_seed 42
```
`train.txt` and `test.txt` will be generated under the `voc_dir/ImageSets/Main/` folder.

- convert the VOC dataset into a COCO dataset. (`voc2coco.py`)
```shell
voc2coco --voc_dir output/test_voc_output --coco_dir output/test_coco_output
```
<p align="right">(<a href="#top">back to top</a>)</p>



<!-- ROADMAP -->
## Roadmap

59 changes: 59 additions & 0 deletions labelme2datasets/split_voc_datasets.py
@@ -0,0 +1,59 @@
"""splitting voc format datasets into training set and test set"""
# coding=utf-8

import argparse
import sys
import os
import os.path as osp
import glob
from pathlib import Path
from sklearn.model_selection import train_test_split


def main():
"""splitting voc format datasets into training set and test set"""
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--voc_dir', help='input annotated directory')
parser.add_argument('--test_ratio', help='test set ratio', default=0.3)
parser.add_argument('--random_seed', help='random seed ', default=42)
args = parser.parse_args()

if not osp.exists(args.voc_dir):
print('directory not exists:', args.voc_dir)
sys.exit(1)

annotation_dir = osp.join(args.voc_dir, 'Annotations')
if not osp.exists(annotation_dir):
print('annotation directory not exists:', annotation_dir)
sys.exit(1)

output_dir = osp.join(args.voc_dir, 'ImageSets', 'Main')
if not osp.exists(output_dir):
os.makedirs(output_dir)

train_file = osp.join(output_dir, 'train.txt')
test_file = osp.join(output_dir, 'test.txt')
if osp.exists(train_file) or osp.exists(test_file):
print(f'train.txt: {train_file} exists or test.txt: {train_file} exists,please check!')
sys.exit(1)

total_files = glob.glob(osp.join(annotation_dir, '*.xml'))
total_files = [Path(o).stem for o in total_files]
train_set, test_set = train_test_split(total_files,
test_size=float(args.test_ratio),
random_state=int(args.random_seed))

with open(train_file, 'w', encoding='utf8') as train_f:
for file in train_set:
train_f.write(file + "\n")

with open(test_file, 'w', encoding='utf8') as test_f:
for file in test_set:
test_f.write(file + "\n")

print(f"split Completed. Number of Train Samples: {len(train_set)}."
f" Number of Test Samples: {len(test_set)}")


if __name__ == '__main__':
main()
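
With the three test samples committed in this repo, a run looks roughly like this (2 train / 1 test, matching the committed train.txt and test.txt):

```shell
split_voc_datasets --voc_dir output/test_voc_output --test_ratio 0.3 --random_seed 42
# Split completed. Number of train samples: 2. Number of test samples: 1
```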
17 changes: 17 additions & 0 deletions labelme2datasets/utils.py
@@ -4,6 +4,7 @@

# coding=utf-8

import os.path as osp

def get_label_conversion_dict(dict_file):
    """
@@ -21,3 +22,19 @@ def get_label_conversion_dict(dict_file):
            words = line.split(":")
            label_dict[words[0].strip()] = words[1].strip()
    return label_dict


def get_coco_category(labels_file):
    """Build the category dictionary used for lookup when generating the COCO dataset."""
    if not osp.exists(labels_file):
        print('file does not exist:', labels_file)
        return None
    attr_dict = {"categories": []}
    label_id = 0
    with open(labels_file, "r", encoding='UTF-8') as label_f:
        for line in label_f:
            label = line.strip()
            label_item = {"supercategory": "defect", "id": label_id, "name": label}
            attr_dict["categories"].append(label_item)
            label_id += 1
    return attr_dict
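
As a quick illustration of `get_coco_category` (a sketch; it assumes the labels file, here `data/label_names.txt` from the README usage, holds the class names that also appear in the committed test.json below):

```python
from labelme2datasets.utils import get_coco_category

# labels file with one class name per line, e.g.:
#   _background_
#   BuDaoDian
#   CaHua
attr_dict = get_coco_category("data/label_names.txt")
print(attr_dict)
# {'categories': [{'supercategory': 'defect', 'id': 0, 'name': '_background_'},
#                 {'supercategory': 'defect', 'id': 1, 'name': 'BuDaoDian'},
#                 {'supercategory': 'defect', 'id': 2, 'name': 'CaHua'}, ...]}
```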
175 changes: 175 additions & 0 deletions labelme2datasets/voc2coco.py
@@ -0,0 +1,175 @@
"""turn voc format datasets into coco format datasets"""
# coding = utf-8

import argparse
import sys
import os
import os.path as osp
from pathlib import Path
import json
import glob
from collections import OrderedDict
import shutil
import xmltodict
from labelme2datasets.utils import get_coco_category


def get_xml_anno_list(set_file, voc_dir, coco_dir):
"""get_xml_anno_list and copy source image to coco dir."""
voc_split = Path(set_file).stem # e.g. train.txt -> train
anno_list = []
with open(set_file, "r", encoding='UTF-8') as f_open:
for line in f_open:
base = line.strip()
# absolute path of file
anno_list.append(osp.join(voc_dir, 'Annotations', base + ".xml"))
# copy image to COCO dataset
# MARK: jpg or png or other pic suffix
image_from = osp.join(voc_dir, "JPEGImages", base + ".jpg")
if not osp.exists(image_from):
print(f"some thing wrong, file not exists: {image_from}")
image_dest = osp.join(coco_dir, voc_split)
shutil.copy(image_from, image_dest)
# print("copy image {} to {}".format(base, image_dest))
print("build anno_list. total samples:", len(anno_list))
return anno_list


def get_image_with_anno(anno_file):
"""get image with annotation"""
image = {}
with open(anno_file, 'r', encoding='utf8') as f_open:
doc = xmltodict.parse(f_open.read())
image['file_name'] = str(doc['annotation']['filename'])
image['height'] = int(doc['annotation']['size']['height'])
image['width'] = int(doc['annotation']['size']['width'])
return image


def get_coco_anno_with_file(anno_file, image_id, attr_dict):
"""get coco annotation with file"""
annotations = []
with open(anno_file, 'r', encoding='utf8') as f_open:
doc = xmltodict.parse(f_open.read())
anno_id = 1
if 'object' in doc['annotation']:
objects = doc['annotation']['object']
if isinstance(objects, OrderedDict):
obj = objects
objects = [obj]

for obj in objects:
for value in attr_dict["categories"]:
if str(obj['name']) != value["name"]:
continue
annotation = {"iscrowd": 0, "image_id": image_id}
# annotation["segmentation"] = []
box_x = int(float(obj["bndbox"]["xmin"]))
box_y = int(float(obj["bndbox"]["ymin"]))
box_w = int(float(obj["bndbox"]["xmax"])) - box_x + 1
box_h = int(float(obj["bndbox"]["ymax"])) - box_y + 1
annotation["bbox"] = [box_x, box_y, box_w, box_h]
annotation["area"] = float(box_w * box_h)
annotation["category_id"] = value["id"]
annotation["ignore"] = 0
annotation["segmentation"] = [[box_x, box_y, box_x, (box_y + box_h - 1),
(box_x + box_w - 1), (box_y + box_h - 1),
(box_x + box_w - 1), box_y]]
annotation["id"] = anno_id
anno_id += 1
annotations.append(annotation)
else:
print(f"File: {anno_file} doesn't have any object")

return annotations
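
# Worked example (cf. the committed test.json below): a VOC box with
# xmin=1090, ymin=2, xmax=1307, ymax=2560 converts to:
#   bbox         = [1090, 2, 218, 2559]    # [x, y, w, h]; w = 1307 - 1090 + 1
#   area         = 557862.0                # 218 * 2559
#   segmentation = [[1090, 2, 1090, 2560, 1307, 2560, 1307, 2]]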


def save_coco_json(attr_dict, anno_path):
    """save coco json file"""
    json_string = json.dumps(attr_dict)
    with open(anno_path, "w", encoding="utf8") as anno_f:
        anno_f.write(json_string)


def generate_coco_annotation(set_file, voc_dir, coco_dir):
    """generate coco annotation from voc annotation"""
    voc_split = Path(set_file).stem  # e.g. train.txt -> train
    anno_file = voc_split + '.json'

    if osp.exists(osp.join(coco_dir, voc_split)):
        print("output image directory already exists:", osp.join(coco_dir, voc_split))
        sys.exit(1)
    os.makedirs(osp.join(coco_dir, voc_split))

    anno_path = osp.join(coco_dir, 'annotations', anno_file)
    if osp.exists(anno_path):
        print('anno file already exists:', anno_path)
        sys.exit(1)

    # check class_names.txt in the voc dataset
    attr_dict = get_coco_category(osp.join(voc_dir, 'class_names.txt'))
    if attr_dict is None:
        print('class_names.txt not found')
        sys.exit(1)

    anno_list = get_xml_anno_list(set_file, voc_dir, coco_dir)

    image_id = 0
    images = []
    annotations = []

    for file in anno_list:
        if not osp.exists(file):
            print("file does not exist:", file)
            continue

        image_id += 1

        # get image info
        image = get_image_with_anno(file)
        image['id'] = image_id
        images.append(image)

        # get annotation info
        current_annotations = get_coco_anno_with_file(file, image_id, attr_dict)
        annotations.extend(current_annotations)

    attr_dict["images"] = images
    attr_dict["annotations"] = annotations
    attr_dict["type"] = "instances"

    # save file
    save_coco_json(attr_dict, anno_path)


def main():
    """turn voc format datasets into coco format datasets."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--voc_dir', help='INPUT: voc style dataset root directory')
    parser.add_argument('--coco_dir', help='OUTPUT: coco style dataset root directory')
    args = parser.parse_args()

    if not osp.exists(args.voc_dir):
        print('directory does not exist:', args.voc_dir)
        sys.exit(1)

    set_files = glob.glob(osp.join(args.voc_dir, 'ImageSets', 'Main', '*.txt'))
    if len(set_files) == 0:
        print(f"no set files found in: {osp.join(args.voc_dir, 'ImageSets', 'Main')}")
        sys.exit(1)

    if not osp.exists(args.coco_dir):
        os.makedirs(args.coco_dir)
    if not osp.exists(osp.join(args.coco_dir, 'annotations')):
        os.makedirs(osp.join(args.coco_dir, 'annotations'))

    # iterate over every set file (e.g. train.txt, test.txt)
    for set_file in set_files:
        generate_coco_annotation(set_file, args.voc_dir, args.coco_dir)


if __name__ == '__main__':
    main()
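
After conversion, the generated annotation files can be sanity-checked with a few lines of standard-library Python; a minimal sketch, assuming the test output paths used above:

```python
import json

# load the generated COCO annotation file (path from the usage example above)
with open('output/test_coco_output/annotations/train.json', encoding='utf8') as f:
    coco = json.load(f)

print(len(coco['images']), 'images,', len(coco['annotations']), 'annotations')
print([cat['name'] for cat in coco['categories']][:3])  # first few category names
```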
1 change: 1 addition & 0 deletions output/test_coco_output/annotations/test.json
@@ -0,0 +1 @@
{"categories": [{"supercategory": "defect", "id": 0, "name": "_background_"}, {"supercategory": "defect", "id": 1, "name": "BuDaoDian"}, {"supercategory": "defect", "id": 2, "name": "CaHua"}, {"supercategory": "defect", "id": 3, "name": "JiaoWeiLouDi"}, {"supercategory": "defect", "id": 4, "name": "JuPi"}, {"supercategory": "defect", "id": 5, "name": "LouDi"}, {"supercategory": "defect", "id": 6, "name": "PengLiu"}, {"supercategory": "defect", "id": 7, "name": "QiPao"}, {"supercategory": "defect", "id": 8, "name": "QiKeng"}, {"supercategory": "defect", "id": 9, "name": "ZaSe"}, {"supercategory": "defect", "id": 10, "name": "ZangDian"}], "images": [{"file_name": "20180928115522.jpg", "height": 1920, "width": 2560, "id": 1}], "annotations": [{"iscrowd": 0, "image_id": 1, "bbox": [1090, 2, 218, 2559], "area": 557862.0, "category_id": 3, "ignore": 0, "segmentation": [[1090, 2, 1090, 2560, 1307, 2560, 1307, 2]], "id": 1}, {"iscrowd": 0, "image_id": 1, "bbox": [1322, 0, 331, 2561], "area": 847691.0, "category_id": 1, "ignore": 0, "segmentation": [[1322, 0, 1322, 2560, 1652, 2560, 1652, 0]], "id": 2}], "type": "instances"}
1 change: 1 addition & 0 deletions output/test_coco_output/annotations/train.json
@@ -0,0 +1 @@
{"categories": [{"supercategory": "defect", "id": 0, "name": "_background_"}, {"supercategory": "defect", "id": 1, "name": "BuDaoDian"}, {"supercategory": "defect", "id": 2, "name": "CaHua"}, {"supercategory": "defect", "id": 3, "name": "JiaoWeiLouDi"}, {"supercategory": "defect", "id": 4, "name": "JuPi"}, {"supercategory": "defect", "id": 5, "name": "LouDi"}, {"supercategory": "defect", "id": 6, "name": "PengLiu"}, {"supercategory": "defect", "id": 7, "name": "QiPao"}, {"supercategory": "defect", "id": 8, "name": "QiKeng"}, {"supercategory": "defect", "id": 9, "name": "ZaSe"}, {"supercategory": "defect", "id": 10, "name": "ZangDian"}], "images": [{"file_name": "20180928115506.jpg", "height": 1920, "width": 2560, "id": 1}, {"file_name": "20180928115538.jpg", "height": 1920, "width": 2560, "id": 2}], "annotations": [{"iscrowd": 0, "image_id": 1, "bbox": [1178, 2, 269, 2559], "area": 688371.0, "category_id": 1, "ignore": 0, "segmentation": [[1178, 2, 1178, 2560, 1446, 2560, 1446, 2]], "id": 1}, {"iscrowd": 0, "image_id": 1, "bbox": [1017, 0, 156, 2561], "area": 399516.0, "category_id": 3, "ignore": 0, "segmentation": [[1017, 0, 1017, 2560, 1172, 2560, 1172, 0]], "id": 2}, {"iscrowd": 0, "image_id": 2, "bbox": [1075, 0, 183, 2561], "area": 468663.0, "category_id": 3, "ignore": 0, "segmentation": [[1075, 0, 1075, 2560, 1257, 2560, 1257, 0]], "id": 1}, {"iscrowd": 0, "image_id": 2, "bbox": [1293, 0, 295, 2561], "area": 755495.0, "category_id": 1, "ignore": 0, "segmentation": [[1293, 0, 1293, 2560, 1587, 2560, 1587, 0]], "id": 2}], "type": "instances"}
Binary file added output/test_coco_output/test/20180928115522.jpg
Binary file added output/test_coco_output/train/20180928115506.jpg
Binary file added output/test_coco_output/train/20180928115538.jpg
1 change: 1 addition & 0 deletions output/test_voc_output/ImageSets/Main/test.txt
@@ -0,0 +1 @@
20180928115522
2 changes: 2 additions & 0 deletions output/test_voc_output/ImageSets/Main/train.txt
@@ -0,0 +1,2 @@
20180928115506
20180928115538
6 changes: 5 additions & 1 deletion requirements.txt
@@ -2,4 +2,8 @@ imgviz~=1.4.1
pillow~=8.4.0
labelme~=4.5.13
lxml~=4.6.4
progressbar~=2.5
setuptools~=58.0.4
xmltodict~=0.12.0
sklearn~=0.0
scikit-learn~=0.24.2
11 changes: 8 additions & 3 deletions setup.py
@@ -6,7 +6,7 @@

setup(
    name='labelme2datasets',
    version='0.0.2',
    description='python scripts to convert labelme-generated-jsons to voc/coco style datasets.',
    long_description=long_description,
    long_description_content_type="text/markdown",
@@ -18,12 +18,17 @@
        'pillow~=8.4.0',
        'labelme~=4.5.13',
        'lxml~=4.6.4',
        'progressbar~=2.5',
        'xmltodict~=0.12.0',
        'sklearn~=0.0',
        'scikit-learn~=0.24.2',
    ],
    entry_points={
        'console_scripts': [
            'labelme_json2dataset = labelme2datasets.labelme_json2dataset:main',
            'labelme_bbox_json2voc = labelme2datasets.labelme_bbox_json2voc:main',
            'split_voc_datasets = labelme2datasets.split_voc_datasets:main',
            'voc2coco = labelme2datasets.voc2coco:main',
        ]
    },
    classifiers=[
