From 82a62747b5cec85d730907e1c10989d4313ac259 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Tue, 28 Jan 2025 01:00:29 +0800
Subject: [PATCH] fix grounding dataset objects format and norm1 bbox scaling

---
 ...11\346\225\260\346\215\256\351\233\206.md" |  4 +--
 swift/llm/dataset/dataset/mllm.py             | 29 ++++++++++---------
 swift/llm/template/grounding.py               |  6 +++-
 3 files changed, 23 insertions(+), 16 deletions(-)
diff --git "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md"
index e19e153b7..c0806b928 100644
--- "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md"
+++ "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md"
@@ -128,8 +128,8 @@ RLHF的数据格式可以参考纯文本大模型的格式。
 该格式比通用格式多了objects字段，该字段包含的字段有：
  - ref：用于替换`<ref-object>`
  - bbox：用于替换`<bbox>`
- - bbox_type: 可选项为'real'，'norm1'。默认为real，即bbox为真实bbox值。若是'norm1'，则bbox已经归一化为0~1
- - image_id: 该参数只有当bbox_type为real时生效。代表bbox对应的图片是第几张，用于缩放bbox。索引从0开始，默认全为第0张
+ - bbox_type: 可选项为'real'，'norm1'。默认为'real'，即bbox为真实bbox值。若是'norm1'，则bbox已经归一化为0~1
+ - image_id: 该参数只有当bbox_type为'real'时生效。代表bbox对应的图片是第几张，用于缩放bbox。索引从0开始，默认全为第0张
 
 ### 文生图格式
 
diff --git a/swift/llm/dataset/dataset/mllm.py b/swift/llm/dataset/dataset/mllm.py
index 164cf45da..0d11fbf75 100644
--- a/swift/llm/dataset/dataset/mllm.py
+++ b/swift/llm/dataset/dataset/mllm.py
@@ -777,12 +777,10 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
             bbox[i] = round(float(bbox[i]))
         res = {}
 
-        objects = [{
-            'caption': caption,
-            'bbox': bbox,
-            'bbox_type': 'real',
-            'image': 0,
-        }]
+        objects = {
+            'ref': [caption],
+            'bbox': [bbox],
+        }
         res['query'], res['response'] = self.construct_grounding_prompt()
         res['images'] = [image_path]
         res['objects'] = objects
@@ -996,10 +994,14 @@ def replace_intervals_with_tags(response, start_ends):
         return ''.join(result)
 
     def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
-        images = row['url']
+        images = row['images']
         caption = row['caption']
         ref_exps = row['ref_exps']
-        objects = []
+        objects = {
+            'ref': [],
+            'bbox': [],
+            'bbox_type': 'norm1'
+        }
         start_end_pairs = []
         for ref_exp in ref_exps:
             start = ref_exp[0]
@@ -1008,10 +1010,11 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
             start_end_pairs.append(ref_exp[0:2])
 
             object_part = caption[int(start):int(end)]
-            objects.append({'caption': object_part, 'bbox': ref_exp[2:6], 'bbox_type': 'real', 'image': 0})
+            objects['ref'].append(object_part)
+            objects['bbox'].append(ref_exp[2:6])
 
         start_end_pairs.sort(key=lambda x: (x[0], x[1]))
-        if self.has_overlap(start_end_pairs) or not objects:
+        if self.has_overlap(start_end_pairs) or not ref_exps:
             return
 
         if self.task_type in ('grounding', 'caption'):
@@ -1038,15 +1041,15 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         hf_dataset_id='zzliang/GRIT',
         subsets=[
             SubsetDataset(
-                subset='caption',
+                name='caption',
                 preprocess_func=GritPreprocessor('caption', columns_mapping={'url': 'images'}),
             ),
             SubsetDataset(
-                subset='grounding',
+                name='grounding',
                 preprocess_func=GritPreprocessor('grounding', columns_mapping={'url': 'images'}),
             ),
             SubsetDataset(
-                subset='vqa',
+                name='vqa',
                 preprocess_func=GritPreprocessor('vqa', columns_mapping={'url': 'images'}),
             )
         ],
diff --git a/swift/llm/template/grounding.py b/swift/llm/template/grounding.py
index e6c7490e7..4d28b2e78 100644
--- a/swift/llm/template/grounding.py
+++ b/swift/llm/template/grounding.py
@@ -10,12 +10,16 @@ def normalize_bbox(images: List[Image.Image],
         return
     bbox_list = objects['bbox']
     ref_list = objects['ref']
+    bbox_type = objects.get('bbox_type') or 'real'
     image_id_list = objects.get('image_id') or []
     image_id_list += [0] * (len(ref_list) - len(image_id_list))
     for bbox, ref, image_id in zip(bbox_list, ref_list, image_id_list):
         image = images[image_id]
         if norm_bbox == 'norm1000':
-            width, height = image.width, image.height
+            if bbox_type == 'norm1':
+                width, height = 1, 1
+            else:
+                width, height = image.width, image.height
             for i, (x, y) in enumerate(zip(bbox[::2], bbox[1::2])):
                 bbox[2 * i] = int(x / width * 1000)
                 bbox[2 * i + 1] = int(y / height * 1000)