-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_layout_ollama.py
99 lines (77 loc) · 2.69 KB
/
generate_layout_ollama.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from pydantic import BaseModel
from ollama import chat
import argparse
import time
from utils import *
def generate_layout(
    description: str,
    model: str,
    eval_mode: bool = False,
) -> tuple[list[str], list[list[int]]]:
    """Ask an Ollama chat model for a bounding-box layout of an image description.

    A system prompt plus one user message (a fixed instruction followed by
    ``description``) is sent through ``chat``. The reply is constrained to the
    JSON schema of ``ObjectLayout`` and then parsed back into that model.

    Args:
        description: Free-text description of the image to lay out.
        model: Name of the Ollama model passed to ``chat``.
        eval_mode: When False (the default), the description and the parsed
            layout are printed to stdout; when True nothing is printed.

    Returns:
        A pair ``(object_names, object_boxes)`` of equal length, in the order
        the model returned the objects. Each box is ``[x0, y0, x1, y1]``.

    Raises:
        Nothing is caught here: any error raised by ``chat`` or by
        ``ObjectLayout.model_validate_json`` propagates to the caller.
    """

    # One named box, with its four coordinates kept as separate integer fields.
    # NOTE: the class names end up in the JSON schema sent to the model, so
    # they are deliberately left unchanged.
    class Object(BaseModel):
        name: str
        x0: int
        y0: int
        x1: int
        y1: int

    # Overall layout structure: the list of boxes the model has to return.
    class ObjectLayout(BaseModel):
        objects: list[Object]

    # The instruction that is prefixed to the user's description.
    instruction = "Provide box coordinates for an image with "

    # Messages follow the example json files in the attention refocusing repo,
    # but without an in-context example.
    # NOTE(review): the system prompt has no space in "objects.Size"; it is
    # reproduced verbatim here because changing it changes what the model sees.
    messages = [
        {
            "role": "system",
            "content": (
                "Your goal is to assist users by providing helpful and relevant "
                "information. In this context, you are expected to generate specific "
                "coordinate box locations for objects in a description, considering "
                "their relative sizes and positions and the numbers of objects."
                "Size of image is 512*512"
            ),
        },
        {"role": "user", "content": instruction + description},
    ]

    # Passing the schema as `format` asks Ollama for schema-shaped JSON output.
    response = chat(
        model=model,
        messages=messages,
        format=ObjectLayout.model_json_schema(),
    )
    layout = ObjectLayout.model_validate_json(response.message.content)

    # Echo the description and model output unless running in eval mode.
    if not eval_mode:
        print("\n-----Image Description-----\n")
        print(description)
        print("\n-----Model Output-----\n")
        print(layout)

    # Split the layout into parallel lists of names and coordinate boxes.
    # (`item` rather than `object`, to avoid shadowing the builtin.)
    object_names = [item.name for item in layout.objects]
    object_boxes = [[item.x0, item.y0, item.x1, item.y1] for item in layout.objects]
    return object_names, object_boxes
# Running the script: prompt for a description, generate its layout with the
# chosen Ollama model, report the runtime and draw the resulting boxes.
if __name__ == "__main__":
    # Getting command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        default="llama3",
        help="The ollama model to use",
    )
    args = parser.parse_args()
    model = args.model

    # Getting inputs.
    description = input("Please describe your image: ")
    image_name = input("Enter a name for your image to save the layout: ")

    # Time only the layout generation. perf_counter() is a monotonic,
    # high-resolution clock, so the measured interval is not affected by
    # system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()
    names, boxes = generate_layout(description, model)
    runtime = time.perf_counter() - start_time
    print(f"\nRuntime: {runtime:.2f} seconds")

    # draw_box is not defined in this file; presumably it comes from the
    # `utils` star import — verify its handling of a missing output folder.
    output_folder = "./outputs"
    draw_box(names, boxes, output_folder, image_name + ".jpg")