diff --git a/src/generate_lib/llama32.py b/src/generate_lib/llama32.py
new file mode 100644
index 0000000..51b4202
--- /dev/null
+++ b/src/generate_lib/llama32.py
@@ -0,0 +1,34 @@
+import requests
+import torch
+from PIL import Image
+from tqdm import tqdm
+from transformers import MllamaForConditionalGeneration, AutoProcessor
+
+def generate_response(queries, model_path):
+    model = MllamaForConditionalGeneration.from_pretrained(model_path,
+                                                            torch_dtype=torch.bfloat16,
+                                                            device_map="auto")
+    processor = AutoProcessor.from_pretrained(model_path)
+
+    for k in tqdm(queries):
+        query = queries[k]['question']
+        image = queries[k]["figure_path"]
+        image = Image.open(image).convert('RGB')
+        messages = [
+            {"role": "user", "content": [
+                {"type": "image"},
+                {"type": "text", "text": query}
+            ]}
+        ]
+        input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+        inputs = processor(
+            image,
+            input_text,
+            add_special_tokens=False,
+            return_tensors="pt"
+        ).to(model.device)
+
+        output = model.generate(**inputs, max_new_tokens=1024)
+        response = processor.decode(output[0])
+        response = response.split("<|start_header_id|>assistant<|end_header_id|>")[1].replace("<|eot_id|>", "").strip()
+        queries[k]['response'] = response
diff --git a/src/generate_lib/llavaov.py b/src/generate_lib/llavaov.py
new file mode 100644
index 0000000..2894467
--- /dev/null
+++ b/src/generate_lib/llavaov.py
@@ -0,0 +1,50 @@
+# pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
+from llava.model.builder import load_pretrained_model
+from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
+from llava.conversation import conv_templates, SeparatorStyle
+
+from PIL import Image
+import requests
+import copy
+import torch
+
+import sys
+import warnings
+from tqdm import tqdm
+
+warnings.filterwarnings("ignore")
+def generate_response(queries, model_path):
+    model_name = "llava_qwen"
+    device = "cuda"
+    device_map = "auto"
+    tokenizer, model, image_processor, max_length = load_pretrained_model(model_path, None, model_name, device_map=device_map)  # Add any other thing you want to pass in llava_model_args
+
+    model.eval()
+
+    for k in tqdm(queries):
+        query = queries[k]['question']
+        image = queries[k]["figure_path"]
+        image = Image.open(image).convert('RGB')
+        image_tensor = process_images([image], image_processor, model.config)
+        image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]
+
+        conv_template = "qwen_1_5"  # Make sure you use correct chat template for different models
+        question = DEFAULT_IMAGE_TOKEN + "\n{}".format(query)
+        conv = copy.deepcopy(conv_templates[conv_template])
+        conv.append_message(conv.roles[0], question)
+        conv.append_message(conv.roles[1], None)
+        prompt_question = conv.get_prompt()
+
+        input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
+        image_sizes = [image.size]
+        cont = model.generate(
+            input_ids,
+            images=image_tensor,
+            image_sizes=image_sizes,
+            do_sample=False,
+            temperature=0,
+            max_new_tokens=4096,
+        )
+        text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
+        queries[k]['response'] = text_outputs
diff --git a/src/generate_lib/molmo.py b/src/generate_lib/molmo.py
new file mode 100644
index 0000000..ced153f
--- /dev/null
+++ b/src/generate_lib/molmo.py
@@ -0,0 +1,36 @@
+from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+from PIL import Image
+from tqdm import tqdm
+
+def generate_response(queries, model_path):
+    processor = AutoProcessor.from_pretrained(
+        model_path,
+        trust_remote_code=True,
+        torch_dtype='auto',
+        device_map='auto'
+    )
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        trust_remote_code=True,
+        torch_dtype='auto',
+        device_map='auto'
+    )
+
+    for k in tqdm(queries):
+        query = queries[k]['question']
+        image = queries[k]["figure_path"]
+        image = Image.open(image).convert('RGB')
+        inputs = processor.process(
+            images=[image],
+            text=query
+        )
+        inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
+        output = model.generate_from_batch(
+            inputs,
+            GenerationConfig(max_new_tokens=1024, stop_strings="<|endoftext|>"),
+            tokenizer=processor.tokenizer
+        )
+        generated_tokens = output[0,inputs['input_ids'].size(1):]
+        generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+        queries[k]['response'] = generated_text
diff --git a/src/generate_lib/nvlm.py b/src/generate_lib/nvlm.py
new file mode 100644
index 0000000..3200116
--- /dev/null
+++ b/src/generate_lib/nvlm.py
@@ -0,0 +1,132 @@
+import torch
+from transformers import AutoTokenizer, AutoModel
+import math
+from PIL import Image
+import torchvision.transforms as T
+from torchvision.transforms.functional import InterpolationMode
+from tqdm import tqdm
+
+def split_model():
+    device_map = {}
+    world_size = torch.cuda.device_count()
+    num_layers = 80
+    # Since the first GPU will be used for ViT, treat it as half a GPU.
+    num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
+    num_layers_per_gpu = [num_layers_per_gpu] * world_size
+    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
+    layer_cnt = 0
+    for i, num_layer in enumerate(num_layers_per_gpu):
+        for j in range(num_layer):
+            device_map[f'language_model.model.layers.{layer_cnt}'] = i
+            layer_cnt += 1
+    device_map['vision_model'] = 0
+    device_map['mlp1'] = 0
+    device_map['language_model.model.tok_embeddings'] = 0
+    device_map['language_model.model.embed_tokens'] = 0
+    device_map['language_model.output'] = 0
+    device_map['language_model.model.norm'] = 0
+    device_map['language_model.lm_head'] = 0
+    device_map['language_model.model.rotary_emb'] = 0
+    device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
+
+    return device_map
+
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+
+def build_transform(input_size):
+    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+    transform = T.Compose([
+        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=MEAN, std=STD)
+    ])
+    return transform
+
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    best_ratio_diff = float('inf')
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+
+
+def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # calculate the existing image aspect ratio
+    target_ratios = set(
+        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
+        i * j <= max_num and i * j >= min_num)
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+
+
+def load_image(image_file, input_size=448, max_num=12):
+    image = Image.open(image_file).convert('RGB')
+    transform = build_transform(input_size=input_size)
+    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+    pixel_values = [transform(image) for image in images]
+    pixel_values = torch.stack(pixel_values)
+    return pixel_values
+
+
+def generate_response(queries, model_path):
+    device_map = split_model()
+    model = AutoModel.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        low_cpu_mem_usage=True,
+        use_flash_attn=False,
+        trust_remote_code=True,
+        device_map=device_map).eval()
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
+    generation_config = dict(max_new_tokens=1024, do_sample=False)
+
+    for k in tqdm(queries):
+        query = queries[k]['question']
+        image = queries[k]["figure_path"]
+        query = f'<image>\n{query}'
+        pixel_values = load_image(image, max_num=12).to(torch.bfloat16)
+        response = model.chat(tokenizer, pixel_values, query, generation_config)
+        queries[k]['response'] = response
diff --git a/src/generate_lib/o1.py b/src/generate_lib/o1.py
new file mode 100644
index 0000000..64b3960
--- /dev/null
+++ b/src/generate_lib/o1.py
@@ -0,0 +1,68 @@
+import base64
+import requests
+
+def get_client_model(model_path, api_key):
+    assert api_key is not None, "API key is required for using GPT"
+    assert model_path is not None, "Model name is required for using GPT"
+    model = model_path
+    client = None
+    return client, model
+
+def generate_response(image_path, query, model, media_type="image/jpeg", api_key=None, client=None, random_baseline=False):
+    def encode_image(image_path):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    # Getting the base64 string
+    base64_image = encode_image(image_path)
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {api_key}"
+    }
+    if not random_baseline:
+        payload = {
+            "model": model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": query
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}"
+                            }
+                        }
+                    ]
+                }
+            ],
+            "temperature": 1.0,
+            "top_p": 1.0,
+            "seed": 42
+        }
+    else:
+        payload = {
+            "model": model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": query
+                        }
+                    ]
+                }
+            ],
+            "temperature": 1.0,
+            "top_p": 1.0,
+            "seed": 42
+        }
+
+    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+    response = response.json()
+    return response['choices'][0]['message']['content']
\ No newline at end of file
diff --git a/src/generate_lib/phi35.py b/src/generate_lib/phi35.py
new file mode 100644
index 0000000..7313ba6
--- /dev/null
+++ b/src/generate_lib/phi35.py
@@ -0,0 +1,41 @@
+from PIL import Image
+import requests
+from transformers import AutoModelForCausalLM
+from transformers import AutoProcessor
+from tqdm import tqdm
+
+def generate_response(queries, model_path):
+    model = AutoModelForCausalLM.from_pretrained(model_path,
+                                                 device_map="cuda",
+                                                 trust_remote_code=True,
+                                                 torch_dtype="auto",
+                                                 _attn_implementation='flash_attention_2')
+    processor = AutoProcessor.from_pretrained(model_path,
+                                              trust_remote_code=True,
+                                              num_crops=16)
+    for k in tqdm(queries):
+        query = queries[k]['question']
+        image = queries[k]["figure_path"]
+        image = Image.open(image).convert('RGB')
+        images = [image]
+        query = f"<|image_1|>\n{query}"
+        messages = [
+            {"role": "user", "content": query}
+        ]
+        prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")
+        generation_args = {
+            "max_new_tokens": 1000,
+            "temperature": 0.0,
+            "do_sample": False
+        }
+        generate_ids = model.generate(**inputs,
+                                      eos_token_id=processor.tokenizer.eos_token_id,
+                                      **generation_args)
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = processor.batch_decode(generate_ids,
+                                          skip_special_tokens=True,
+                                          clean_up_tokenization_spaces=False)[0]
+        print(response)
+        queries[k]['response'] = response
+
diff --git a/src/generate_lib/pixtral.py b/src/generate_lib/pixtral.py
new file mode 100644
index 0000000..2864a22
--- /dev/null
+++ b/src/generate_lib/pixtral.py
@@ -0,0 +1,24 @@
+from mistral_inference.transformer import Transformer
+from mistral_inference.generate import generate
+
+from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
+from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageURLChunk, ImageChunk
+from mistral_common.protocol.instruct.request import ChatCompletionRequest
+
+from PIL import Image
+from tqdm import tqdm
+
+def generate_response(queries, model_path):
+    tokenizer = MistralTokenizer.from_file(f"{model_path}/tekken.json")
+    model = Transformer.from_folder(model_path)
+    for k in tqdm(queries):
+        query = queries[k]['question']
+        image = queries[k]["figure_path"]
+        image = Image.open(image).convert('RGB')
+        completion_request = ChatCompletionRequest(messages=[UserMessage(content=[ImageChunk(image=image), TextChunk(text=query)])])
+        encoded = tokenizer.encode_chat_completion(completion_request)
+        images = encoded.images
+        tokens = encoded.tokens
+        out_tokens, _ = generate([tokens], model, images=[images], max_tokens=1024, temperature=0., eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
+        response = tokenizer.decode(out_tokens[0])
+        queries[k]['response'] = response
diff --git a/src/generate_lib/qwen2.py b/src/generate_lib/qwen2.py
new file mode 100644
index 0000000..911357e
--- /dev/null
+++ b/src/generate_lib/qwen2.py
@@ -0,0 +1,55 @@
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
+from PIL import Image
+from tqdm import tqdm
+import torch
+
+def generate_response(queries, model_path):
+    # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
+    model = Qwen2VLForConditionalGeneration.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        attn_implementation="flash_attention_2",
+        device_map="auto",
+    )
+
+    # default processor
+    processor = AutoProcessor.from_pretrained(model_path)
+
+    for k in tqdm(queries):
+        query = queries[k]["question"]
+        image = queries[k]["figure_path"]
+        image = Image.open(image).convert("RGB")
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": query},
+                ],
+            }
+        ]
+
+        text = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cuda")
+
+        generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        response = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        queries[k]["response"] = response
diff --git a/src/generate_lib/utils.py b/src/generate_lib/utils.py
index efcd6d0..501e0fa 100644
--- a/src/generate_lib/utils.py
+++ b/src/generate_lib/utils.py
@@ -44,6 +44,11 @@ def get_client_fn(model_path):
                         'gpt-4-turbo-2024-04-09',
                         'gpt-4o-mini-2024-07-18']:
         from .gpt import get_client_model
+    # o1
+    elif model_path in ['o1-preview',
+                        'o1-mini',
+                        'o1-2024-12-17']:
+        from .o1 import get_client_model
     # reka
     elif model_path in ['reka-core-20240415',
                         'reka-flash-20240226',
@@ -74,6 +79,35 @@ def get_generate_fn(model_path):
                         'claude-3-haiku-20240307',
                         'claude-3-5-sonnet-20240620']:
         from .claude import generate_response
+    # llama 3.2
+    elif model_name in ['Llama-3.2-11B-Vision-Instruct',
+                        'Llama-3.2-90B-Vision-Instruct']:
+        from .llama32 import generate_response
+    # llavaov
+    elif model_name in ['llava-onevision-qwen2-0.5b-ov',
+                        'llava-onevision-qwen2-7b-ov',
+                        'llava-onevision-qwen2-72b-ov-chat']:
+        from .llavaov import generate_response
+    # molmo
+    elif model_name in ['Molmo-7B-D-0924',
+                        'Molmo-7B-O-0924',
+                        'Molmo-72B-0924',
+                        'MolmoE-1B-0924',]:
+        from .molmo import generate_response
+    # nvlm
+    elif model_name in ['NVLM-D-72B']:
+        from .nvlm import generate_response
+    # phi35
+    elif model_name in ['Phi-3.5-vision-instruct']:
+        from .phi35 import generate_response
+    # pixtral
+    elif model_name in ['Pixtral-12B-2409']:
+        from .pixtral import generate_response
+    # qwen2
+    elif model_name in ['Qwen2-VL-2B-Instruct',
+                        'Qwen2-VL-7B-Instruct',
+                        'Qwen2-VL-72B-Instruct']:
+        from .qwen2 import generate_response
     # deepseekvl
     elif model_name in ['deepseek-vl-7b-chat']:
         from .deepseekvl import generate_response
@@ -90,6 +124,11 @@ def get_generate_fn(model_path):
                         'gpt-4-turbo-2024-04-09',
                         'gpt-4o-mini-2024-07-18']:
         from .gpt import generate_response
+    # o1
+    elif model_name in ['o1-preview',
+                        'o1-mini',
+                        'o1-2024-12-17']:
+        from .o1 import generate_response
     # idefics2
     elif model_name in ['idefics2-8b',
                         'idefics2-8b-chatty',