diff --git a/doc/jsk_perception/nodes/detection_node.md b/doc/jsk_perception/nodes/detection_node.md
new file mode 100644
index 0000000000..e8846b8a12
--- /dev/null
+++ b/doc/jsk_perception/nodes/detection_node.md
@@ -0,0 +1,89 @@
+# detection_node.py
+
+![](images/dino.png)
+
+The ROS node for Open-Vocabulary Object Detection with GroundingDINO.
+
+## System Configuration
+![](images/large_scale_vil_system.png)
+
+This node needs to work with a Docker container that runs the inference. Please build the container first by following the setup instructions below.
+
+### Prerequisites
+This node requires an NVIDIA GPU with more than 4GB of GPU memory to work properly.
+You have to install nvidia-container-toolkit to use the GPU with Docker. Please follow the [official instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
+
+### Build the docker image
+You have to build the Docker image for GroundingDINO:
+
+```shell
+roscd jsk_perception/docker
+make
+```
+
+## Subscribing topic
+* `~image` (`sensor_msgs/Image`)
+
+  Input image
+
+## Publishing topic
+* `~output/image` (`sensor_msgs/Image`)
+
+  Image with the detected bounding boxes drawn on it
+
+* `~rects` (`jsk_recognition_msgs/RectArray`)
+
+  Array of detected bounding box regions
+
+* `~result` (`jsk_recognition_msgs/DetectionResult`)
+
+  Detection result
+
+* `~result/image` (`sensor_msgs/Image`)
+
+  Image used for inference
+
+* `~visualize` (`std_msgs/String`)
+
+  Detection result formatted as a string for visualization
+
+## Action topic
+* `~inference_server/goal` (`jsk_recognition_msgs/DetectionTaskActionGoal`)
+
+  Detection request with custom categories and an input image
+
+* `~inference_server/result` (`jsk_recognition_msgs/DetectionTaskActionResult`)
+
+  Detection result of `~inference_server/goal`
+
+## Parameters
+* `~host` (String, default: `localhost`)
+
+  The hostname or IP address of the inference container
+
+* `~port` (Integer, default: `8080`)
+
+  The HTTP port of the inference container
+
+## Dynamic Reconfigure Parameters
+* `~queries` (string, default: `human;kettle;cup;glass`)
+
+  Default categories used for detection on the subscribed image topic.
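+
+### Example action client
+The action interface above accepts per-request categories. The following is a minimal, untested sketch of such a client; the `/detection` namespace, the `inference_server` action name, and the image topic are assumptions that depend on how you launch the node (see the launch examples below). The result layout follows `DetectionTask.action` and `DetectionResult.msg` added in this PR.
+
+```python
+#!/usr/bin/env python
+# Minimal sketch of a DetectionTask action client (node/topic names are assumptions).
+import actionlib
+import rospy
+from jsk_recognition_msgs.msg import DetectionTaskAction, DetectionTaskGoal
+from sensor_msgs.msg import Image
+
+rospy.init_node("detection_task_client")
+client = actionlib.SimpleActionClient("/detection/inference_server", DetectionTaskAction)
+client.wait_for_server()
+
+goal = DetectionTaskGoal()
+goal.image = rospy.wait_for_message("/camera/rgb/image_raw", Image)  # your image topic
+goal.queries = ["cup", "bottle"]  # per-request categories
+client.send_goal(goal)
+client.wait_for_result()
+result = client.get_result()
+for name, rect in zip(result.result.classification.label_names, result.result.rects.rects):
+    rospy.loginfo("%s: x=%d y=%d w=%d h=%d", name, rect.x, rect.y, rect.width, rect.height)
+```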
+
+### Run inference container on another host or another terminal
+On the remote GPU machine,
+```shell
+cd jsk_recognition/jsk_perception/docker
+./run_jsk_vil_api dino --port (Your vacant port)
+```
+
+On the ROS machine,
+```shell
+roslaunch jsk_perception detection.launch port:=(Your inference container port) host:=(Your inference container host) DETECTION_INPUT_IMAGE:=(Your image topic name) gui:=true
+```
+
+
+### Run both the inference container and the ROS node on a single host
+```shell
+roslaunch jsk_perception detection.launch run_api:=true DETECTION_INPUT_IMAGE:=(Your image topic name) gui:=true
+```
\ No newline at end of file
diff --git a/doc/jsk_perception/nodes/images/dino.png b/doc/jsk_perception/nodes/images/dino.png
new file mode 100644
index 0000000000..b3c8159eac
Binary files /dev/null and b/doc/jsk_perception/nodes/images/dino.png differ
diff --git a/jsk_perception/docker/Makefile b/jsk_perception/docker/Makefile
index 83eb2800ab..43b3abec8b 100644
--- a/jsk_perception/docker/Makefile
+++ b/jsk_perception/docker/Makefile
@@ -5,9 +5,11 @@
 # api directories
 OFAPROJECT = ofa
 CLIPPROJECT = clip
+DINOPROJECT = dino
 # image names
 OFAIMAGE = jsk-ofa-server
 CLIPIMAGE = jsk-clip-server
+DINOIMAGE = jsk-dino-server
 # commands
 BUILDIMAGE = docker build
 REMOVEIMAGE = docker rmi
@@ -23,7 +25,7 @@ PARAMURLS = parameter_urls.txt
 
 # OFA parameters
 OFAPARAMFILES = $(foreach param, $(OFAPARAMS), $(PARAMDIR)/$(param))
 
-all: ofa clip
+all: ofa clip dino
 
 # TODO check command wget exists, nvidia-driver version
@@ -41,6 +43,9 @@ ofa: $(PARAMDIR)/.download
 clip: $(PARAMDIR)/.download
 	$(BUILDIMAGE) $(CLIPPROJECT) -t $(CLIPIMAGE) -f $(CLIPPROJECT)/Dockerfile
 
+dino: $(PARAMDIR)/.download
+	$(BUILDIMAGE) $(DINOPROJECT) -t $(DINOIMAGE) -f $(DINOPROJECT)/Dockerfile
+
 # TODO add clip, glip
 clean:
 	@$(REMOVEIMAGE) $(OFAIMAGE)
@@ -48,4 +53,4 @@ clean:
 wipe: clean
 	rm -fr $(PARAMDIR)
 
-.PHONY: clean wipe ofa clip
+.PHONY: clean wipe ofa clip dino
diff --git a/jsk_perception/docker/dino/Dockerfile b/jsk_perception/docker/dino/Dockerfile
new file mode 100644
index 0000000000..b5a18ef314
--- /dev/null
+++ b/jsk_perception/docker/dino/Dockerfile
@@ -0,0 +1,27 @@
+# FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-devel
+FROM pytorch/pytorch:1.9.1-cuda11.1-cudnn8-devel
+# FROM pytorch/pytorch:1.13.1-cuda11.6-cudnn8-devel
+ARG DEBIAN_FRONTEND=noninteractive
+RUN apt -o Acquire::AllowInsecureRepositories=true update \
+    && apt-get install -y \
+    curl \
+    git \
+    libopencv-dev \
+    wget \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+ENV CUDA_HOME /usr/local/cuda
+ENV TORCH_CUDA_ARCH_LIST 8.0+PTX
+RUN git clone https://github.com/IDEA-Research/GroundingDINO.git
+RUN echo 'export CUDA_HOME=/usr/local/cuda' >> ~/.bashrc
+RUN echo 'TORCH_CUDA_ARCH_LIST=8.0+PTX' >> ~/.bashrc
+RUN pip install flask opencv-python \
+    && pip install "numpy>=1.20"
+RUN cd GroundingDINO \
+    && pip install -r requirements.txt \
+    && pip install -e .
+RUN mkdir -p GroundingDINO/weights \ + && cd GroundingDINO/weights \ + && wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth +COPY server.py /workspace/GroundingDINO +ENTRYPOINT cd /workspace/GroundingDINO && python server.py \ No newline at end of file diff --git a/jsk_perception/docker/dino/server.py b/jsk_perception/docker/dino/server.py new file mode 100644 index 0000000000..84f9014dd4 --- /dev/null +++ b/jsk_perception/docker/dino/server.py @@ -0,0 +1,99 @@ +from groundingdino.util.inference import load_model, load_image, predict, annotate +import groundingdino.datasets.transforms as T +from torchvision.ops import box_convert + +import cv2 +import numpy as np +from PIL import Image as PLImage +import torch + +# web server +from flask import Flask, request, Response +import json +import base64 + + +def apply_half(t): + if t.dtype is torch.float32: + return t.to(dtype=torch.half) + return t + +class Inference: + def __init__(self, gpu_id=None): + self.gpu_id = gpu_id + self.device = "cuda" if torch.cuda.is_available() else "cpu" + self.model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth") + self.BOX_TRESHOLD = 0.35 + self.TEXT_TRESHOLD = 0.25 + + def convert_to_string(self, input_list): + output_string = "" + for item in input_list: + output_string += item + " . " + return output_string.strip() + + def infer(self, img, texts): + # get cv2 image + # image = cv2.resize(img, dsize=(640, 480)) # NOTE forcely + # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + image_source = PLImage.fromarray(image) + image = np.asarray(image_source) + transform = T.Compose( + [ + T.RandomResize([800], max_size=1333), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + image_transformed, _ = transform(image_source, None) + + image_source = image + image = image_transformed + + TEXT_PROMPT = self.convert_to_string(texts) + + boxes, logits, phrases = predict( + model=self.model, + image=image, + caption=TEXT_PROMPT, + box_threshold=self.BOX_TRESHOLD, + text_threshold=self.TEXT_TRESHOLD, + device = self.device + ) + + h, w, _ = image_source.shape + boxes = boxes * torch.Tensor([w, h, w, h]) + xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() + + results = {} + for i in range(len(xyxy)): + box = xyxy[i].tolist() + logit = logits[i].item() + results[i] = {"box": box, "logit": logit, "phrase": phrases[i]} + + return results + +# run +if __name__ == "__main__": + app = Flask(__name__) + infer = Inference() + + @app.route("/detection", methods=['POST']) + def detection_request(): + data = request.data.decode("utf-8") + data_json = json.loads(data) + # process image + image_b = data_json['image'] + image_dec = base64.b64decode(image_b) + data_np = np.fromstring(image_dec, dtype='uint8') + img = cv2.imdecode(data_np, 1) + # get text + texts = data_json['queries'] + infer_results = infer.infer(img, texts) + results = [] + for i in range(len(infer_results)): + results.append({"id": i, "box": infer_results[i]["box"], "logit": infer_results[i]["logit"], "phrase": infer_results[i]["phrase"]}) + return Response(response=json.dumps({"results": results}), status=200) + + app.run("0.0.0.0", 8080, threaded=True) diff --git a/jsk_perception/docker/ofa/server.py b/jsk_perception/docker/ofa/server.py index 6c667e3328..82bb636486 100644 --- a/jsk_perception/docker/ofa/server.py +++ 
b/jsk_perception/docker/ofa/server.py @@ -61,7 +61,7 @@ def __init__(self, task, model_scale): utils.split_paths(param_path), arg_overrides=overrides) elif task == "refcoco": - tasks.register_task(self.task, RefcocoTask) + tasks.register_task(task, RefcocoTask) self.models, self.cfg, self.task = checkpoint_utils.load_model_ensemble_and_task( utils.split_paths(param_path), arg_overrides=overrides) @@ -140,6 +140,15 @@ def encode_text(self, text, length=None, append_bos=False, append_eos=False): s = torch.cat([s, eos_item]) return s + def convert_objects_to_text(self, text): + if len(text) == 1: + object_text = text[0] + elif len(text) >= 2: + object_text = ', '.join(text[:-1]) + f' or {text[-1]}' + else: + object_text = '' + return object_text + def construct_sample(self, image, text): if self.task_name == "caption" or self.task_name == "vqa_gen": patch_image = self.patch_resize_transform(image).unsqueeze(0) @@ -176,7 +185,8 @@ def construct_sample(self, image, text): h_resize_ratio = torch.tensor(patch_image_size / h).unsqueeze(0) patch_image = self.patch_resize_transform(image).unsqueeze(0) patch_mask = torch.tensor([True]) - src_text = self.encode_text(' which region does the text " {} " describe?'.format(text), append_bos=True, + object_text = self.convert_objects_to_text(text) + src_text = self.encode_text(' which region does the text " {} " describe?'.format(object_text), append_bos=True, append_eos=True).unsqueeze(0) src_length = torch.LongTensor([s.ne(self.pad_idx).long().sum() for s in src_text]) sample = { @@ -214,7 +224,24 @@ def infer(self, img, text): text = result[0]['answer'] return text elif self.task_name == "refcoco": - pass + # image = cv2.resize(img, dsize=(640, 480)) # NOTE forcely + # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + image = Image.fromarray(image) + # Construct input sample & preprocess for GPU if cuda available for VG + sample = self.construct_sample(image, text) + sample = utils.move_to_cuda(sample) if self.use_cuda else sample + sample = utils.apply_to_sample(apply_half, sample) if self.use_fp16 else sample + with torch.no_grad(): + result, scores = eval_step(self.task, self.generator, self.models, sample) + results = {} + object_text = self.convert_objects_to_text(text) + for i in range(len(result)): + box = result[i]["box"] + logit = scores[i].item() + results[i] = {"box": box, "logit": logit, "phrase": object_text} + + return results # run if __name__ == "__main__": @@ -232,6 +259,9 @@ def infer(self, img, text): elif ofa_task == "vqa_gen": vqa_infer = Inference("vqa_gen", ofa_model_scale) + elif ofa_task == "detection": + detection_infer = Inference("refcoco", ofa_model_scale) + else: raise RuntimeError("No application is available") @@ -274,5 +304,25 @@ def vqa_request(): return Response(response=json.dumps({"results": results}), status=200) except NameError: print("Skipping create vqa_gen app") - + + try: + @app.route("/detection", methods=['POST']) + def detection_request(): + data = request.data.decode("utf-8") + data_json = json.loads(data) + # process image + image_b = data_json['image'] + image_dec = base64.b64decode(image_b) + data_np = np.fromstring(image_dec, dtype='uint8') + img = cv2.imdecode(data_np, 1) + # get text + texts = data_json['queries'] + infer_results = detection_infer.infer(img, texts) + results = [] + for i in range(len(infer_results)): + results.append({"id": i, "box": infer_results[i]["box"], "logit": infer_results[i]["logit"], "phrase": infer_results[i]["phrase"]}) + 
return Response(response=json.dumps({"results": results}), status=200) + except NameError: + print("Skipping create detection app") + app.run("0.0.0.0", 8080, threaded=True) diff --git a/jsk_perception/docker/run_jsk_vil_api b/jsk_perception/docker/run_jsk_vil_api index acef636280..8c91444ef3 100755 --- a/jsk_perception/docker/run_jsk_vil_api +++ b/jsk_perception/docker/run_jsk_vil_api @@ -10,7 +10,8 @@ import subprocess import sys CONTAINERS = {"ofa": "jsk-ofa-server", - "clip": "jsk-clip-server"} + "clip": "jsk-clip-server", + "dino": "jsk-dino-server"} OFA_MODEL_SCALES = ["base", "large", "huge"] parser = argparse.ArgumentParser(description="JSK Vision and Language API runner") diff --git a/jsk_perception/launch/detection.launch b/jsk_perception/launch/detection.launch new file mode 100644 index 0000000000..4feec17522 --- /dev/null +++ b/jsk_perception/launch/detection.launch @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + host: $(arg host) + port: $(arg port) + model: $(arg model) + + + + + + diff --git a/jsk_perception/node_scripts/detection_node.py b/jsk_perception/node_scripts/detection_node.py new file mode 100755 index 0000000000..aada431e11 --- /dev/null +++ b/jsk_perception/node_scripts/detection_node.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python + +import rospy +from jsk_perception.vil_inference_client import DINOClientNode + + +def main(): + rospy.init_node("dino") + node = DINOClientNode() + rospy.spin() + +if __name__ == "__main__": + main() diff --git a/jsk_perception/sample/config/sample_ofa_config.rviz b/jsk_perception/sample/config/sample_ofa_config.rviz index 458138e652..bdbdb70408 100644 --- a/jsk_perception/sample/config/sample_ofa_config.rviz +++ b/jsk_perception/sample/config/sample_ofa_config.rviz @@ -5,7 +5,7 @@ Panels: Property Tree Widget: Expanded: ~ Splitter Ratio: 0.4870370328426361 - Tree Height: 509 + Tree Height: 625 - Class: rviz/Selection Name: Selection - Class: rviz/Tool Properties @@ -113,6 +113,38 @@ Visualization Manager: text size: 12 top: 320 width: 512 + - Class: jsk_rviz_plugin/OverlayImage + Enabled: true + Name: ObjectDetection/Output/Image + Topic: /detection/output/image + Value: true + alpha: 0.800000011920929 + height: 128 + keep aspect ratio: true + left: 530 + overwrite alpha value: false + top: 10 + transport hint: raw + width: 320 + - Align Bottom: false + Background Alpha: 0.800000011920929 + Background Color: 0; 0; 0 + Class: jsk_rviz_plugin/String + Enabled: true + Foreground Alpha: 0.800000011920929 + Foreground Color: 255; 255; 255 + Name: ObjectDetection/Visualize + Overtake Color Properties: true + Overtake Position Properties: true + Topic: /detection/visualize + Value: true + font: DejaVu Sans Mono + height: 500 + left: 530 + line width: 2 + text size: 12 + top: 320 + width: 512 Enabled: true Global Options: Background Color: 48; 48; 48 @@ -164,10 +196,10 @@ Visualization Manager: Window Geometry: Displays: collapsed: false - Height: 1025 + Height: 1016 Hide Left Dock: false Hide Right Dock: true - QMainWindow State: 
000000ff00000000fd00000004000000000000021e000002f7fc0200000008fb0000001200530065006c0065006300740069006f006e00000001e10000009b000000b100fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000b1fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c006100790073010000006f000002f70000018400fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261000000010000010f0000037efc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073000000003d0000037e0000013500fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e100000197000000030000078000000060fc0100000002fb0000000800540069006d0065010000000000000780000005cd00fffffffb0000000800540069006d0065010000000000000450000000000000000000000556000002f700000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000 + QMainWindow State: 000000ff00000000fd00000004000000000000021e00000338fc0200000008fb0000001200530065006c0065006300740069006f006e00000001e10000009b0000005c00fffffffb0000001e0054006f006f006c002000500072006f007000650072007400690065007302000001ed000001df00000185000000b1fb000000120056006900650077007300200054006f006f02000001df000002110000018500000122fb000000200054006f006f006c002000500072006f0070006500720074006900650073003203000002880000011d000002210000017afb000000100044006900730070006c006100790073010000003d00000338000000c900fffffffb0000002000730065006c0065006300740069006f006e00200062007500660066006500720200000138000000aa0000023a00000294fb00000014005700690064006500530074006500720065006f02000000e6000000d2000003ee0000030bfb0000000c004b0069006e0065006300740200000186000001060000030c00000261000000010000010f0000037efc0200000003fb0000001e0054006f006f006c002000500072006f00700065007200740069006500730100000041000000780000000000000000fb0000000a00560069006500770073000000003d0000037e000000a400fffffffb0000001200530065006c0065006300740069006f006e010000025a000000b200000000000000000000000200000490000000a9fc0100000001fb0000000a00560069006500770073030000004e00000080000002e100000197000000030000073800000060fc0100000002fb0000000800540069006d0065010000000000000738000003bc00fffffffb0000000800540069006d00650100000000000004500000000000000000000005140000033800000004000000040000000800000008fc0000000100000002000000010000000a0054006f006f006c00730100000000ffffffff0000000000000000 Selection: collapsed: false Time: @@ -176,6 +208,6 @@ Window Geometry: collapsed: false Views: collapsed: true - Width: 1920 - X: 1440 - Y: 1096 + Width: 1848 + X: 72 + Y: 27 diff --git a/jsk_perception/src/jsk_perception/vil_inference_client.py b/jsk_perception/src/jsk_perception/vil_inference_client.py index e42948a2c7..76233faf5f 100644 --- a/jsk_perception/src/jsk_perception/vil_inference_client.py +++ b/jsk_perception/src/jsk_perception/vil_inference_client.py @@ -6,6 +6,9 @@ import actionlib import requests import rospy +import matplotlib +import matplotlib.cm +import numpy as np from cv_bridge import CvBridge from 
dynamic_reconfigure.server import Server from jsk_perception.cfg import ClassificationConfig, VQAConfig @@ -13,6 +16,11 @@ ClassificationTaskAction, ClassificationTaskFeedback, ClassificationTaskResult, + DetectionResult, + DetectionTaskAction, + DetectionTaskFeedback, + DetectionTaskResult, + Rect, RectArray, QuestionAndAnswerText, VQAResult, VQATaskAction, VQATaskFeedback, VQATaskResult) @@ -56,13 +64,18 @@ def __init__(self, action, self.reconfigure_server = Server(server_config, self.config_cb) self.action_server.start() - def ros_img_to_base(self, ros_img): + def ros_img_to_cv(self, ros_img, encoding="bgr8"): + # convert to cv2 if type(ros_img) is CompressedImage: - cv_img = self._bridge.compressed_imgmsg_to_cv2(ros_img, desired_encoding="bgr8") + cv_img = self._bridge.compressed_imgmsg_to_cv2(ros_img, desired_encoding=encoding) elif type(ros_img) is Image: - cv_img = self._bridge.imgmsg_to_cv2(ros_img, desired_encoding="bgr8") + cv_img = self._bridge.imgmsg_to_cv2(ros_img, desired_encoding=encoding) else: raise RuntimeError("Unknown type {}".format(type(ros_img))) + return cv_img + + def ros_img_to_base(self, ros_img): + cv_img = self.ros_img_to_cv(ros_img) # convert to base64 encimg = cv2.imencode(".png", cv_img)[1] img_str = encimg.tostring() @@ -189,6 +202,99 @@ def inference(self, img_msg, queries): msg.target_names = queries return msg +class DINOClientNode(DockerInferenceClientBase): + def __init__(self): + DockerInferenceClientBase.__init__(self, + DetectionTaskAction, + ClassificationConfig, + DetectionResult, + DetectionTaskFeedback, + DetectionTaskResult, + "detection") + self.model_name = rospy.get_param("~model", default="dino") + self.pub_class = rospy.Publisher('~class', ClassificationResult, queue_size=1) + self.pub_rects = rospy.Publisher('~rects', RectArray, queue_size=1) + self.pub_image = rospy.Publisher('~output/image', Image, queue_size=1) + + def topic_cb(self, data): + if not self.config: rospy.logwarn("No queries"); return + if not self.config.queries: rospy.logwarn("No queries"); return + queries = self.config.queries.split(";") + try: + msg = self.inference(data, queries) + except Exception: return + # publish debug image + self.image_pub.publish(data) + # publish detection result + msg.header = data.header + self.result_pub.publish(msg) + # publish probabilities result as string + vis_msg = "" + for i, label in enumerate(msg.classification.label_names): + vis_msg += "{}: {:.2f}% ".format(label, msg.classification.probabilities[i]*100) + self.vis_pub.publish(vis_msg) + + def create_queries(self, goal): + return goal.queries + + def inference(self, img_msg, queries): + img_byte = self.ros_img_to_base(img_msg) + req = json.dumps({"image": img_byte, + "queries": queries}).encode("utf-8") + response = self.send_request(req) + result_dic = json.loads(response.text)["results"] + + boxes = [] + scores = [] + labels = [] + for r in result_dic: + boxes.append(r["box"]) + scores.append(r["logit"]) + labels.append(r["phrase"]) + classification_msg = ClassificationResult(header=img_msg.header) + classification_msg.labels = list(range(len(labels))) + classification_msg.label_names = labels + classification_msg.label_proba = scores # cosine similarities + classification_msg.probabilities = scores # sum(probabilities) is 1 + classification_msg.classifier = self.model_name + classification_msg.target_names = queries + self.pub_class.publish(classification_msg) + + rect_msg = RectArray(header=img_msg.header) + vis_img = self.ros_img_to_cv(img_msg, encoding="rgb8") + cmap = 
matplotlib.cm.get_cmap('hsv') + n = max(len(boxes) - 1, 10) + rects = [] + for i in range(len(boxes)): + box = boxes[i] + rgba = np.array(cmap(1. * i / n)) + color = rgba[:3] * 255 + label_text = '{}, {:.2f}'.format(labels[i], scores[i]) + x_min = max(int(box[0]), 0) + y_min = max(int(box[1]), 0) + x_max = min(int(box[2]), vis_img.shape[1]) + y_max = min(int(box[3]), vis_img.shape[0]) + cv2.rectangle( + vis_img, (x_min, y_min), (x_max, y_max), + color, thickness=3, lineType=cv2.LINE_AA) + cv2.putText( + vis_img, label_text, (x_min, max(y_min - 10, 0)), + cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, + thickness=2, lineType=cv2.LINE_AA) + rect = Rect( + x=x_min, y=y_min, + width=x_max - x_min, height=y_max - y_min) + rect_msg.rects.append(rect) + + self.pub_rects.publish(rect_msg) + vis_msg = self._bridge.cv2_to_imgmsg(vis_img, 'rgb8') + vis_msg.header = img_msg.header + self.pub_image.publish(vis_msg) + + msg = self.result_topic_type() + msg.classification = classification_msg + msg.rects = rect_msg + return msg class OFAClientNode(DockerInferenceClientBase): def __init__(self): diff --git a/jsk_recognition_msgs/CMakeLists.txt b/jsk_recognition_msgs/CMakeLists.txt index 86e6164943..7603c95698 100644 --- a/jsk_recognition_msgs/CMakeLists.txt +++ b/jsk_recognition_msgs/CMakeLists.txt @@ -21,6 +21,7 @@ add_message_files( ColorHistogram.msg DepthCalibrationParameter.msg DepthErrorResult.msg + DetectionResult.msg ExifTags.msg ExifGPSInfo.msg HeightmapConfig.msg @@ -116,6 +117,7 @@ add_service_files(FILES add_action_files(FILES VQATask.action + DetectionTask.action ClassificationTask.action ) diff --git a/jsk_recognition_msgs/action/DetectionTask.action b/jsk_recognition_msgs/action/DetectionTask.action new file mode 100644 index 0000000000..09b589a3bf --- /dev/null +++ b/jsk_recognition_msgs/action/DetectionTask.action @@ -0,0 +1,8 @@ +sensor_msgs/Image image +sensor_msgs/CompressedImage compressed_image +string[] queries +--- +jsk_recognition_msgs/DetectionResult result +bool done +--- +string status diff --git a/jsk_recognition_msgs/msg/DetectionResult.msg b/jsk_recognition_msgs/msg/DetectionResult.msg new file mode 100644 index 0000000000..0fee48cd92 --- /dev/null +++ b/jsk_recognition_msgs/msg/DetectionResult.msg @@ -0,0 +1,8 @@ +# information about frame and timestamp +Header header + +# Classification results of detected objects +jsk_recognition_msgs/ClassificationResult classification + +# Rectangles of detected objects +jsk_recognition_msgs/RectArray rects
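For reference, the `/detection` HTTP endpoint defined in `server.py` above can also be exercised without ROS, which is handy when debugging the container on its own. The snippet below is an untested sketch of the request/response format; the host, port, and image path are placeholders, while the JSON field names follow `server.py`.

```python
# Minimal sketch: query the jsk-dino-server container's /detection endpoint directly.
import base64
import json

import requests

with open("sample.jpg", "rb") as f:  # placeholder image file
    img_b64 = base64.b64encode(f.read()).decode("ascii")

req = {"image": img_b64, "queries": ["cup", "bottle"]}
res = requests.post("http://localhost:8080/detection", data=json.dumps(req))
for r in res.json()["results"]:
    # each result carries "id", "box" ([x_min, y_min, x_max, y_max] in pixels), "logit" and "phrase"
    print(r["phrase"], r["logit"], r["box"])
```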