diff --git a/doc/jsk_perception/nodes/detection_node.md b/doc/jsk_perception/nodes/detection_node.md
new file mode 100644
index 0000000000..e8846b8a12
--- /dev/null
+++ b/doc/jsk_perception/nodes/detection_node.md
@@ -0,0 +1,89 @@
+# detection_node.py
+
+![](images/dino.png)
+
+The ROS node for Open-Vocabulary Object Detection with GroundingDINO.
+
+## System Configuration
+![](images/large_scale_vil_system.png)
+
+This node needs to work with a Docker container that runs the inference. Please build the container first by following the setup instructions below.
+
+### Prerequisites
+This node requires an NVIDIA GPU with more than 4GB of GPU memory to work properly.
+You have to install nvidia-container-toolkit to use the GPU with Docker. Please follow the [official instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
+
+### Build the docker image
+You have to build the Docker image for GroundingDINO:
+
+```shell
+roscd jsk_perception/docker
+make
+```
+
+## Subscribing topic
+* `~image` (`sensor_msgs/Image`)
+
+  Input image
+
+## Publishing topic
+* `~output/image` (`sensor_msgs/Image`)
+
+  Image with the detected bounding boxes drawn on it
+
+* `~rects` (`jsk_recognition_msgs/RectArray`)
+
+  Array of detected bounding box regions
+
+* `~result` (`jsk_recognition_msgs/DetectionResult`)
+
+  Detection result
+
+* `~result/image` (`sensor_msgs/Image`)
+
+  Image used for inference
+
+* `~visualize` (`std_msgs/String`)
+
+  Detection result formatted as a string for visualization
+
+## Action topic
+* `~inference_server/goal` (`jsk_recognition_msgs/DetectionTaskActionGoal`)
+
+  Detection request with custom categories and an input image
+
+* `~inference_server/result` (`jsk_recognition_msgs/DetectionTaskActionResult`)
+
+  Detection result of `~inference_server/goal`
+
+## Parameters
+* `~host` (String, default: `localhost`)
+
+  The hostname or IP address of the inference container
+
+* `~port` (Integer, default: `8080`)
+
+  The HTTP port of the inference container
+
+## Dynamic Reconfigure Parameters
+* `~queries` (string, default: `human;kettle;cup;glass`)
+
+  Default categories used for detection on the subscribed image topic.
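+
+### Example action client
+The action interface above accepts per-request categories. The following is a minimal, untested sketch of such a client; the `/detection` namespace, the `inference_server` action name, and the image topic are assumptions that depend on how you launch the node (see the launch examples below). The result layout follows `DetectionTask.action` and `DetectionResult.msg` added in this PR.
+
+```python
+#!/usr/bin/env python
+# Minimal sketch of a DetectionTask action client (node/topic names are assumptions).
+import actionlib
+import rospy
+from jsk_recognition_msgs.msg import DetectionTaskAction, DetectionTaskGoal
+from sensor_msgs.msg import Image
+
+rospy.init_node("detection_task_client")
+client = actionlib.SimpleActionClient("/detection/inference_server", DetectionTaskAction)
+client.wait_for_server()
+
+goal = DetectionTaskGoal()
+goal.image = rospy.wait_for_message("/camera/rgb/image_raw", Image)  # your image topic
+goal.queries = ["cup", "bottle"]  # per-request categories
+client.send_goal(goal)
+client.wait_for_result()
+result = client.get_result()
+for name, rect in zip(result.result.classification.label_names, result.result.rects.rects):
+    rospy.loginfo("%s: x=%d y=%d w=%d h=%d", name, rect.x, rect.y, rect.width, rect.height)
+```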
+
+### Run inference container on another host or another terminal
+On the remote GPU machine,
+```shell
+cd jsk_recognition/jsk_perception/docker
+./run_jsk_vil_api dino --port (Your vacant port)
+```
+
+On the ROS machine,
+```shell
+roslaunch jsk_perception detection.launch port:=(Your inference container port) host:=(Your inference container host) DETECTION_INPUT_IMAGE:=(Your image topic name) gui:=true
+```
+
+
+### Run both the inference container and the ROS node on a single host
+```shell
+roslaunch jsk_perception detection.launch run_api:=true DETECTION_INPUT_IMAGE:=(Your image topic name) gui:=true
+```
\ No newline at end of file
diff --git a/doc/jsk_perception/nodes/images/dino.png b/doc/jsk_perception/nodes/images/dino.png
new file mode 100644
index 0000000000..b3c8159eac
Binary files /dev/null and b/doc/jsk_perception/nodes/images/dino.png differ
diff --git a/jsk_perception/docker/Makefile b/jsk_perception/docker/Makefile
index 83eb2800ab..43b3abec8b 100644
--- a/jsk_perception/docker/Makefile
+++ b/jsk_perception/docker/Makefile
@@ -5,9 +5,11 @@
 # api directories
 OFAPROJECT = ofa
 CLIPPROJECT = clip
+DINOPROJECT = dino
 # image names
 OFAIMAGE = jsk-ofa-server
 CLIPIMAGE = jsk-clip-server
+DINOIMAGE = jsk-dino-server
 # commands
 BUILDIMAGE = docker build
 REMOVEIMAGE = docker rmi
@@ -23,7 +25,7 @@ PARAMURLS = parameter_urls.txt
 
 # OFA parameters
 OFAPARAMFILES = $(foreach param, $(OFAPARAMS), $(PARAMDIR)/$(param))
 
-all: ofa clip
+all: ofa clip dino
 
 # TODO check command wget exists, nvidia-driver version
@@ -41,6 +43,9 @@ ofa: $(PARAMDIR)/.download
 clip: $(PARAMDIR)/.download
 	$(BUILDIMAGE) $(CLIPPROJECT) -t $(CLIPIMAGE) -f $(CLIPPROJECT)/Dockerfile
 
+dino: $(PARAMDIR)/.download
+	$(BUILDIMAGE) $(DINOPROJECT) -t $(DINOIMAGE) -f $(DINOPROJECT)/Dockerfile
+
 # TODO add clip, glip
 clean:
 	@$(REMOVEIMAGE) $(OFAIMAGE)
@@ -48,4 +53,4 @@ clean:
 wipe: clean
 	rm -fr $(PARAMDIR)
 
-.PHONY: clean wipe ofa clip
+.PHONY: clean wipe ofa clip dino
diff --git a/jsk_perception/docker/dino/Dockerfile b/jsk_perception/docker/dino/Dockerfile
new file mode 100644
index 0000000000..b5a18ef314
--- /dev/null
+++ b/jsk_perception/docker/dino/Dockerfile
@@ -0,0 +1,27 @@
+# FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-devel
+FROM pytorch/pytorch:1.9.1-cuda11.1-cudnn8-devel
+# FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt -o Acquire::AllowInsecureRepositories=true update \
+    && apt-get install -y \
+    curl \
+    git \
+    libopencv-dev \
+    wget \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+ENV CUDA_HOME /usr/local/cuda
+ENV TORCH_CUDA_ARCH_LIST 8.0+PTX
+RUN git clone https://github.com/IDEA-Research/GroundingDINO.git
+RUN echo 'export CUDA_HOME=/usr/local/cuda' >> ~/.bashrc
+RUN echo 'TORCH_CUDA_ARCH_LIST=8.0+PTX' >> ~/.bashrc
+RUN pip install flask opencv-python \
+    && pip install "numpy>=1.20"
+RUN cd GroundingDINO \
+    && pip install -r requirements.txt \
+    && pip install -e .
+RUN mkdir -p GroundingDINO/weights \ + && cd GroundingDINO/weights \ + && wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth +COPY server.py /workspace/GroundingDINO +ENTRYPOINT cd /workspace/GroundingDINO && python server.py \ No newline at end of file diff --git a/jsk_perception/docker/dino/server.py b/jsk_perception/docker/dino/server.py new file mode 100644 index 0000000000..84f9014dd4 --- /dev/null +++ b/jsk_perception/docker/dino/server.py @@ -0,0 +1,99 @@ +from groundingdino.util.inference import load_model, load_image, predict, annotate +import groundingdino.datasets.transforms as T +from torchvision.ops import box_convert + +import cv2 +import numpy as np +from PIL import Image as PLImage +import torch + +# web server +from flask import Flask, request, Response +import json +import base64 + + +def apply_half(t): + if t.dtype is torch.float32: + return t.to(dtype=torch.half) + return t + +class Inference: + def __init__(self, gpu_id=None): + self.gpu_id = gpu_id + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth") + self.BOX_TRESHOLD = 0.35 + self.TEXT_TRESHOLD = 0.25 + + def convert_to_string(self, input_list): + output_string = "" + for item in input_list: + output_string += item + " . " + return output_string.strip() + + def infer(self, img, texts): + # get cv2 image + # image = cv2.resize(img, dsize=(640, 480)) # NOTE forcely + # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + image_source = PLImage.fromarray(image) + image = np.asarray(image_source) + transform = T.Compose( + [ + T.RandomResize([800], max_size=1333), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + image_transformed, _ = transform(image_source, None) + + image_source = image + image = image_transformed + + TEXT_PROMPT = self.convert_to_string(texts) + + boxes, logits, phrases = predict( + model=self.model, + image=image, + caption=TEXT_PROMPT, + box_threshold=self.BOX_TRESHOLD, + text_threshold=self.TEXT_TRESHOLD, + device = self.device + ) + + h, w, _ = image_source.shape + boxes = boxes * torch.Tensor([w, h, w, h]) + xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() + + results = {} + for i in range(len(xyxy)): + box = xyxy[i].tolist() + logit = logits[i].item() + results[i] = {"box": box, "logit": logit, "phrase": phrases[i]} + + return results + +# run +if __name__ == "__main__": + app = Flask(__name__) + infer = Inference() + + @app.route("/detection", methods=['POST']) + def detection_request(): + data = request.data.decode("utf-8") + data_json = json.loads(data) + # process image + image_b = data_json['image'] + image_dec = base64.b64decode(image_b) + data_np = np.fromstring(image_dec, dtype='uint8') + img = cv2.imdecode(data_np, 1) + # get text + texts = data_json['queries'] + infer_results = infer.infer(img, texts) + results = [] + for i in range(len(infer_results)): + results.append({"id": i, "box": infer_results[i]["box"], "logit": infer_results[i]["logit"], "phrase": infer_results[i]["phrase"]}) + return Response(response=json.dumps({"results": results}), status=200) + + app.run("0.0.0.0", 8080, threaded=True) diff --git a/jsk_perception/docker/ofa/server.py b/jsk_perception/docker/ofa/server.py index 6c667e3328..82bb636486 100644 --- a/jsk_perception/docker/ofa/server.py +++ 
b/jsk_perception/docker/ofa/server.py @@ -61,7 +61,7 @@ def __init__(self, task, model_scale): utils.split_paths(param_path), arg_overrides=overrides) elif task == "refcoco": - tasks.register_task(self.task, RefcocoTask) + tasks.register_task(task, RefcocoTask) self.models, self.cfg, self.task = checkpoint_utils.load_model_ensemble_and_task( utils.split_paths(param_path), arg_overrides=overrides) @@ -140,6 +140,15 @@ def encode_text(self, text, length=None, append_bos=False, append_eos=False): s = torch.cat([s, eos_item]) return s + def convert_objects_to_text(self, text): + if len(text) == 1: + object_text = text[0] + elif len(text) >= 2: + object_text = ', '.join(text[:-1]) + f' or {text[-1]}' + else: + object_text = '' + return object_text + def construct_sample(self, image, text): if self.task_name == "caption" or self.task_name == "vqa_gen": patch_image = self.patch_resize_transform(image).unsqueeze(0) @@ -176,7 +185,8 @@ def construct_sample(self, image, text): h_resize_ratio = torch.tensor(patch_image_size / h).unsqueeze(0) patch_image = self.patch_resize_transform(image).unsqueeze(0) patch_mask = torch.tensor([True]) - src_text = self.encode_text(' which region does the text " {} " describe?'.format(text), append_bos=True, + object_text = self.convert_objects_to_text(text) + src_text = self.encode_text(' which region does the text " {} " describe?'.format(object_text), append_bos=True, append_eos=True).unsqueeze(0) src_length = torch.LongTensor([s.ne(self.pad_idx).long().sum() for s in src_text]) sample = { @@ -214,7 +224,24 @@ def infer(self, img, text): text = result[0]['answer'] return text elif self.task_name == "refcoco": - pass + # image = cv2.resize(img, dsize=(640, 480)) # NOTE forcely + # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + image = Image.fromarray(image) + # Construct input sample & preprocess for GPU if cuda available for VG + sample = self.construct_sample(image, text) + sample = utils.move_to_cuda(sample) if self.use_cuda else sample + sample = utils.apply_to_sample(apply_half, sample) if self.use_fp16 else sample + with torch.no_grad(): + result, scores = eval_step(self.task, self.generator, self.models, sample) + results = {} + object_text = self.convert_objects_to_text(text) + for i in range(len(result)): + box = result[i]["box"] + logit = scores[i].item() + results[i] = {"box": box, "logit": logit, "phrase": object_text} + + return results # run if __name__ == "__main__": @@ -232,6 +259,9 @@ def infer(self, img, text): elif ofa_task == "vqa_gen": vqa_infer = Inference("vqa_gen", ofa_model_scale) + elif ofa_task == "detection": + detection_infer = Inference("refcoco", ofa_model_scale) + else: raise RuntimeError("No application is available") @@ -274,5 +304,25 @@ def vqa_request(): return Response(response=json.dumps({"results": results}), status=200) except NameError: print("Skipping create vqa_gen app") - + + try: + @app.route("/detection", methods=['POST']) + def detection_request(): + data = request.data.decode("utf-8") + data_json = json.loads(data) + # process image + image_b = data_json['image'] + image_dec = base64.b64decode(image_b) + data_np = np.fromstring(image_dec, dtype='uint8') + img = cv2.imdecode(data_np, 1) + # get text + texts = data_json['queries'] + infer_results = detection_infer.infer(img, texts) + results = [] + for i in range(len(infer_results)): + results.append({"id": i, "box": infer_results[i]["box"], "logit": infer_results[i]["logit"], "phrase": infer_results[i]["phrase"]}) + 
return Response(response=json.dumps({"results": results}), status=200) + except NameError: + print("Skipping create detection app") + app.run("0.0.0.0", 8080, threaded=True) diff --git a/jsk_perception/docker/run_jsk_vil_api b/jsk_perception/docker/run_jsk_vil_api index acef636280..8c91444ef3 100755 --- a/jsk_perception/docker/run_jsk_vil_api +++ b/jsk_perception/docker/run_jsk_vil_api @@ -10,7 +10,8 @@ import subprocess import sys CONTAINERS = {"ofa": "jsk-ofa-server", - "clip": "jsk-clip-server"} + "clip": "jsk-clip-server", + "dino": "jsk-dino-server"} OFA_MODEL_SCALES = ["base", "large", "huge"] parser = argparse.ArgumentParser(description="JSK Vision and Language API runner") diff --git a/jsk_perception/launch/detection.launch b/jsk_perception/launch/detection.launch new file mode 100644 index 0000000000..4feec17522 --- /dev/null +++ b/jsk_perception/launch/detection.launch @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + host: $(arg host) + port: $(arg port) + model: $(arg model) + + + + + + diff --git a/jsk_perception/node_scripts/detection_node.py b/jsk_perception/node_scripts/detection_node.py new file mode 100755 index 0000000000..aada431e11 --- /dev/null +++ b/jsk_perception/node_scripts/detection_node.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python + +import rospy +from jsk_perception.vil_inference_client import DINOClientNode + + +def main(): + rospy.init_node("dino") + node = DINOClientNode() + rospy.spin() + +if __name__ == "__main__": + main() diff --git a/jsk_perception/sample/config/sample_ofa_config.rviz b/jsk_perception/sample/config/sample_ofa_config.rviz index 458138e652..bdbdb70408 100644 --- a/jsk_perception/sample/config/sample_ofa_config.rviz +++ b/jsk_perception/sample/config/sample_ofa_config.rviz @@ -5,7 +5,7 @@ Panels: Property Tree Widget: Expanded: ~ Splitter Ratio: 0.4870370328426361 - Tree Height: 509 + Tree Height: 625 - Class: rviz/Selection Name: Selection - Class: rviz/Tool Properties @@ -113,6 +113,38 @@ Visualization Manager: text size: 12 top: 320 width: 512 + - Class: jsk_rviz_plugin/OverlayImage + Enabled: true + Name: ObjectDetection/Output/Image + Topic: /detection/output/image + Value: true + alpha: 0.800000011920929 + height: 128 + keep aspect ratio: true + left: 530 + overwrite alpha value: false + top: 10 + transport hint: raw + width: 320 + - Align Bottom: false + Background Alpha: 0.800000011920929 + Background Color: 0; 0; 0 + Class: jsk_rviz_plugin/String + Enabled: true + Foreground Alpha: 0.800000011920929 + Foreground Color: 255; 255; 255 + Name: ObjectDetection/Visualize + Overtake Color Properties: true + Overtake Position Properties: true + Topic: /detection/visualize + Value: true + font: DejaVu Sans Mono + height: 500 + left: 530 + line width: 2 + text size: 12 + top: 320 + width: 512 Enabled: true Global Options: Background Color: 48; 48; 48 @@ -164,10 +196,10 @@ Visualization Manager: Window Geometry: Displays: collapsed: false - Height: 1025 + Height: 1016 Hide Left Dock: false Hide Right Dock: true - QMainWindow State: 
000000ff00000000fd00000004000000000000021e000002f7fc0200000008fb0000001200530065006c0065006300740069006f006e00000001e10000009b000000b100fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000b1fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c006100790073010000006f000002f70000018400fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261000000010000010f0000037efc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073000000003d0000037e0000013500fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e100000197000000030000078000000060fc0100000002fb0000000800540069006d0065010000000000000780000005cd00fffffffb0000000800540069006d0065010000000000000450000000000000000000000556000002f700000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000 + QMainWindow State: 000000ff00000000fd00000004000000000000021e00000338fc0200000008fb0000001200530065006c0065006300740069006f006e00000001e10000009b0000005c00fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000b1fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c006100790073010000003d00000338000000c900fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261000000010000010f0000037efc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073000000003d0000037e000000a400fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e100000197000000030000073800000060fc0100000002fb0000000800540069006d0065010000000000000738000003bc00fffffffb0000000800540069006d00650100000000000004500000000000000000000005140000033800000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000 Selection: collapsed: false Time: @@ -176,6 +208,6 @@ Window Geometry: collapsed: false Views: collapsed: true - Width: 1920 - X: 1440 - Y: 1096 + Width: 1848 + X: 72 + Y: 27 diff --git a/jsk_perception/src/jsk_perception/vil_inference_client.py b/jsk_perception/src/jsk_perception/vil_inference_client.py index e42948a2c7..76233faf5f 100644 --- a/jsk_perception/src/jsk_perception/vil_inference_client.py +++ b/jsk_perception/src/jsk_perception/vil_inference_client.py @@ -6,6 +6,9 @@ import actionlib import requests import rospy +import matplotlib +import matplotlib.cm +import numpy as np from cv_bridge import CvBridge from 
dynamic_reconfigure.server import Server from jsk_perception.cfg import ClassificationConfig, VQAConfig @@ -13,6 +16,11 @@ ClassificationTaskAction, ClassificationTaskFeedback, ClassificationTaskResult, + DetectionResult, + DetectionTaskAction, + DetectionTaskFeedback, + DetectionTaskResult, + Rect, RectArray, QuestionAndAnswerText, VQAResult, VQATaskAction, VQATaskFeedback, VQATaskResult) @@ -56,13 +64,18 @@ def __init__(self, action, self.reconfigure_server = Server(server_config, self.config_cb) self.action_server.start() - def ros_img_to_base(self, ros_img): + def ros_img_to_cv(self, ros_img, encoding="bgr8"): + # convert to cv2 if type(ros_img) is CompressedImage: - cv_img = self._bridge.compressed_imgmsg_to_cv2(ros_img, desired_encoding="bgr8") + cv_img = self._bridge.compressed_imgmsg_to_cv2(ros_img, desired_encoding=encoding) elif type(ros_img) is Image: - cv_img = self._bridge.imgmsg_to_cv2(ros_img, desired_encoding="bgr8") + cv_img = self._bridge.imgmsg_to_cv2(ros_img, desired_encoding=encoding) else: raise RuntimeError("Unknown type {}".format(type(ros_img))) + return cv_img + + def ros_img_to_base(self, ros_img): + cv_img = self.ros_img_to_cv(ros_img) # convert to base64 encimg = cv2.imencode(".png", cv_img)[1] img_str = encimg.tostring() @@ -189,6 +202,99 @@ def inference(self, img_msg, queries): msg.target_names = queries return msg +class DINOClientNode(DockerInferenceClientBase): + def __init__(self): + DockerInferenceClientBase.__init__(self, + DetectionTaskAction, + ClassificationConfig, + DetectionResult, + DetectionTaskFeedback, + DetectionTaskResult, + "detection") + self.model_name = rospy.get_param("~model", default="dino") + self.pub_class = rospy.Publisher('~class', ClassificationResult, queue_size=1) + self.pub_rects = rospy.Publisher('~rects', RectArray, queue_size=1) + self.pub_image = rospy.Publisher('~output/image', Image, queue_size=1) + + def topic_cb(self, data): + if not self.config: rospy.logwarn("No queries"); return + if not self.config.queries: rospy.logwarn("No queries"); return + queries = self.config.queries.split(";") + try: + msg = self.inference(data, queries) + except Exception: return + # publish debug image + self.image_pub.publish(data) + # publish detection result + msg.header = data.header + self.result_pub.publish(msg) + # publish probabilities result as string + vis_msg = "" + for i, label in enumerate(msg.classification.label_names): + vis_msg += "{}: {:.2f}% ".format(label, msg.classification.probabilities[i]*100) + self.vis_pub.publish(vis_msg) + + def create_queries(self, goal): + return goal.queries + + def inference(self, img_msg, queries): + img_byte = self.ros_img_to_base(img_msg) + req = json.dumps({"image": img_byte, + "queries": queries}).encode("utf-8") + response = self.send_request(req) + result_dic = json.loads(response.text)["results"] + + boxes = [] + scores = [] + labels = [] + for r in result_dic: + boxes.append(r["box"]) + scores.append(r["logit"]) + labels.append(r["phrase"]) + classification_msg = ClassificationResult(header=img_msg.header) + classification_msg.labels = list(range(len(labels))) + classification_msg.label_names = labels + classification_msg.label_proba = scores # cosine similarities + classification_msg.probabilities = scores # sum(probabilities) is 1 + classification_msg.classifier = self.model_name + classification_msg.target_names = queries + self.pub_class.publish(classification_msg) + + rect_msg = RectArray(header=img_msg.header) + vis_img = self.ros_img_to_cv(img_msg, encoding="rgb8") + cmap = 
matplotlib.cm.get_cmap('hsv') + n = max(len(boxes) - 1, 10) + rects = [] + for i in range(len(boxes)): + box = boxes[i] + rgba = np.array(cmap(1. * i / n)) + color = rgba[:3] * 255 + label_text = '{}, {:.2f}'.format(labels[i], scores[i]) + x_min = max(int(box[0]), 0) + y_min = max(int(box[1]), 0) + x_max = min(int(box[2]), vis_img.shape[1]) + y_max = min(int(box[3]), vis_img.shape[0]) + cv2.rectangle( + vis_img, (x_min, y_min), (x_max, y_max), + color, thickness=3, lineType=cv2.LINE_AA) + cv2.putText( + vis_img, label_text, (x_min, max(y_min - 10, 0)), + cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, + thickness=2, lineType=cv2.LINE_AA) + rect = Rect( + x=x_min, y=y_min, + width=x_max - x_min, height=y_max - y_min) + rect_msg.rects.append(rect) + + self.pub_rects.publish(rect_msg) + vis_msg = self._bridge.cv2_to_imgmsg(vis_img, 'rgb8') + vis_msg.header = img_msg.header + self.pub_image.publish(vis_msg) + + msg = self.result_topic_type() + msg.classification = classification_msg + msg.rects = rect_msg + return msg class OFAClientNode(DockerInferenceClientBase): def __init__(self): diff --git a/jsk_recognition_msgs/CMakeLists.txt b/jsk_recognition_msgs/CMakeLists.txt index 86e6164943..7603c95698 100644 --- a/jsk_recognition_msgs/CMakeLists.txt +++ b/jsk_recognition_msgs/CMakeLists.txt @@ -21,6 +21,7 @@ add_message_files( ColorHistogram.msg DepthCalibrationParameter.msg DepthErrorResult.msg + DetectionResult.msg ExifTags.msg ExifGPSInfo.msg HeightmapConfig.msg @@ -116,6 +117,7 @@ add_service_files(FILES add_action_files(FILES VQATask.action + DetectionTask.action ClassificationTask.action ) diff --git a/jsk_recognition_msgs/action/DetectionTask.action b/jsk_recognition_msgs/action/DetectionTask.action new file mode 100644 index 0000000000..09b589a3bf --- /dev/null +++ b/jsk_recognition_msgs/action/DetectionTask.action @@ -0,0 +1,8 @@ +sensor_msgs/Image image +sensor_msgs/CompressedImage compressed_image +string[] queries +--- +jsk_recognition_msgs/DetectionResult result +bool done +--- +string status diff --git a/jsk_recognition_msgs/msg/DetectionResult.msg b/jsk_recognition_msgs/msg/DetectionResult.msg new file mode 100644 index 0000000000..0fee48cd92 --- /dev/null +++ b/jsk_recognition_msgs/msg/DetectionResult.msg @@ -0,0 +1,8 @@ +# information about frame and timestamp +Header header + +# Classification results of detected objects +jsk_recognition_msgs/ClassificationResult classification + +# Rectangles of detected objects +jsk_recognition_msgs/RectArray rects
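For reference, the `/detection` HTTP endpoint defined in `server.py` above can also be exercised without ROS, which is handy when debugging the container on its own. The snippet below is an untested sketch of the request/response format; the host, port, and image path are placeholders, while the JSON field names follow `server.py`.

```python
# Minimal sketch: query the jsk-dino-server container's /detection endpoint directly.
import base64
import json

import requests

with open("sample.jpg", "rb") as f:  # placeholder image file
    img_b64 = base64.b64encode(f.read()).decode("ascii")

req = {"image": img_b64, "queries": ["cup", "bottle"]}
res = requests.post("http://localhost:8080/detection", data=json.dumps(req))
for r in res.json()["results"]:
    # each result carries "id", "box" ([x_min, y_min, x_max, y_max] in pixels), "logit" and "phrase"
    print(r["phrase"], r["logit"], r["box"])
```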