From 82a62747b5cec85d730907e1c10989d4313ac259 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Tue, 28 Jan 2025 01:00:29 +0800 Subject: [PATCH] fix --- ...11\346\225\260\346\215\256\351\233\206.md" | 4 +-- swift/llm/dataset/dataset/mllm.py | 29 ++++++++++--------- swift/llm/template/grounding.py | 6 +++- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" index e19e153b7..c0806b928 100644 --- "a/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" +++ "b/docs/source/Customization/\350\207\252\345\256\232\344\271\211\346\225\260\346\215\256\351\233\206.md" @@ -128,8 +128,8 @@ RLHF的数据格式可以参考纯文本大模型的格式。 该格式比通用格式多了objects字段,该字段包含的字段有: - ref:用于替换`` - bbox:用于替换`` - - bbox_type: 可选项为'real','norm1'。默认为real,即bbox为真实bbox值。若是'norm1',则bbox已经归一化为0~1 - - image_id: 该参数只有当bbox_type为real时生效。代表bbox对应的图片是第几张,用于缩放bbox。索引从0开始,默认全为第0张 + - bbox_type: 可选项为'real','norm1'。默认为'real',即bbox为真实bbox值。若是'norm1',则bbox已经归一化为0~1 + - image_id: 该参数只有当bbox_type为'real'时生效。代表bbox对应的图片是第几张,用于缩放bbox。索引从0开始,默认全为第0张 ### 文生图格式 diff --git a/swift/llm/dataset/dataset/mllm.py b/swift/llm/dataset/dataset/mllm.py index 164cf45da..0d11fbf75 100644 --- a/swift/llm/dataset/dataset/mllm.py +++ b/swift/llm/dataset/dataset/mllm.py @@ -777,12 +777,10 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: bbox[i] = round(float(bbox[i])) res = {} - objects = [{ - 'caption': caption, - 'bbox': bbox, - 'bbox_type': 'real', - 'image': 0, - }] + objects = { + 'ref': [caption], + 'bbox': [bbox], + } res['query'], res['response'] = self.construct_grounding_prompt() res['images'] = [image_path] res['objects'] = objects @@ -996,10 +994,14 @@ def replace_intervals_with_tags(response, start_ends): return ''.join(result) def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: - images = row['url'] + images = row['images'] caption = row['caption'] ref_exps = row['ref_exps'] - objects = [] + objects = { + 'ref': [], + 'bbox': [], + 'bbox_type': 'norm1' + } start_end_pairs = [] for ref_exp in ref_exps: start = ref_exp[0] @@ -1008,10 +1010,11 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: start_end_pairs.append(ref_exp[0:2]) object_part = caption[int(start):int(end)] - objects.append({'caption': object_part, 'bbox': ref_exp[2:6], 'bbox_type': 'real', 'image': 0}) + objects['ref'].append(object_part) + objects['bbox'].append(ref_exp[2:6]) start_end_pairs.sort(key=lambda x: (x[0], x[1])) - if self.has_overlap(start_end_pairs) or not objects: + if self.has_overlap(start_end_pairs) or not ref_exps: return if self.task_type in ('grounding', 'caption'): @@ -1038,15 +1041,15 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: hf_dataset_id='zzliang/GRIT', subsets=[ SubsetDataset( - subset='caption', + name='caption', preprocess_func=GritPreprocessor('caption', columns_mapping={'url': 'images'}), ), SubsetDataset( - subset='grounding', + name='grounding', preprocess_func=GritPreprocessor('grounding', columns_mapping={'url': 'images'}), ), SubsetDataset( - subset='vqa', + name='vqa', preprocess_func=GritPreprocessor('vqa', columns_mapping={'url': 'images'}), ) ], diff --git a/swift/llm/template/grounding.py b/swift/llm/template/grounding.py index e6c7490e7..4d28b2e78 100644 --- a/swift/llm/template/grounding.py +++ b/swift/llm/template/grounding.py @@ -10,12 +10,16 @@ def normalize_bbox(images: List[Image.Image], return bbox_list = objects['bbox'] ref_list = objects['ref'] + bbox_type = objects.get('bbox_type') or 'real' image_id_list = objects.get('image_id') or [] image_id_list += [0] * (len(ref_list) - len(image_id_list)) for bbox, ref, image_id in zip(bbox_list, ref_list, image_id_list): image = images[image_id] if norm_bbox == 'norm1000': - width, height = image.width, image.height + if bbox_type == 'norm1': + width, height = 1, 1 + else: + width, height = image.width, image.height for i, (x, y) in enumerate(zip(bbox[::2], bbox[1::2])): bbox[2 * i] = int(x / width * 1000) bbox[2 * i + 1] = int(y / height * 1000)