forked from acon96/home-llm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathevaluate.py
129 lines (106 loc) · 5.52 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python3
import argparse, os, re, json
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from tqdm import tqdm
# Default maximum number of tokens kept per prompt when tokenizing.
CTX_SIZE = 2048


def tokenize(tokenizer, prompt, max_length=CTX_SIZE):
    """Tokenize *prompt* into padded, truncated PyTorch tensors.

    Args:
        tokenizer: Callable tokenizer; it is invoked with the prompt plus the
            ``return_tensors``, ``padding``, ``truncation`` and ``max_length``
            keyword arguments.
        prompt: A single prompt or a batch of prompts, passed through as-is.
        max_length: Upper bound on the tokenized length; longer inputs are
            truncated. Defaults to ``CTX_SIZE``.

    Returns:
        Whatever the tokenizer returns for the given arguments.
    """
    return tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )
def generate(model, tokenizer, prompt):
    """Run the model on *prompt* and return the decoded generations.

    The prompt is tokenized with ``tokenize``, passed to ``model.generate``
    with gradient tracking disabled, and the resulting token ids are turned
    back into text with ``tokenizer.batch_decode``.
    """
    encoded = tokenize(tokenizer, prompt)

    # Pure inference: no gradients are needed while generating.
    with torch.no_grad():
        generated_ids = model.generate(**encoded)

    return tokenizer.batch_decode(generated_ids)
def _iter_service_call_lines(service_call_regex, text):
    """Yield each non-blank line inside the ```homeassistant blocks of *text*."""
    for block in service_call_regex.findall(text.strip()):
        for line in block.split("\n"):
            # Blank or whitespace-only lines separate calls; they are not calls.
            if line.strip():
                yield line


def _pop_color_insensitive_match(expected_service_calls, service_call):
    """Remove and return the first expected call equal to *service_call* ignoring "rgb_color".

    Only expected calls that themselves carry an "rgb_color" key are
    considered. Returns None when nothing matches.
    """
    wanted = {key: value for key, value in service_call.items() if key != "rgb_color"}
    for index, candidate in enumerate(expected_service_calls):
        if not isinstance(candidate, dict) or "rgb_color" not in candidate:
            continue
        stripped = {key: value for key, value in candidate.items() if key != "rgb_color"}
        if stripped == wanted:
            # Consume the match so one expected call cannot be credited twice.
            return expected_service_calls.pop(index)
    return None


def main():
    """Evaluate a model's generated service calls against a JSON test set.

    Command line:
        model            Name of the folder under ``./models/`` to load.
        --dataset_file   JSON dataset whose examples have a ``text`` field.
        --split          Marker separating the prompt from the expected
                         response inside each example.
        --batch-size     Number of examples generated per batch (>= 1).

    Only examples containing a ```homeassistant block are evaluated. Every
    expected service call counts as one possible answer. A generated call is
    correct when it equals a not-yet-matched expected call, or when it equals
    one after ignoring "rgb_color" on both sides (also tallied as a color
    mismatch). Results are printed and written to ``eval_results.json``
    inside the model folder.
    """
    parser = argparse.ArgumentParser(description="Evaluate the function calling for a model")
    parser.add_argument("model")
    parser.add_argument("--dataset_file", default="./data/home_assistant_test.json")
    parser.add_argument("--split", default="<|im_start|>assistant")
    parser.add_argument("--batch-size", type=int, default=8)
    args = parser.parse_args()

    batch_size = args.batch_size
    if batch_size < 1:
        # A step of 0 would make range() raise; a negative one would skip everything.
        parser.error("--batch-size must be a positive integer")

    model_folder = f"./models/{args.model}"
    split = args.split

    dataset = load_dataset("json", data_files={ "train": args.dataset_file })["train"]

    # filter out examples that are status requests
    dataset = dataset.filter(lambda example: "```homeassistant" in example["text"])

    service_call_regex = re.compile(r"```homeassistant\n([\S \t\n]*?)```")

    torch.set_default_device("cuda")
    print(f"Loading model from {model_folder}...")
    # NOTE: trust_remote_code=True runs Python code shipped with the checkpoint;
    # only evaluate model folders you trust.
    trained_model = AutoModelForCausalLM.from_pretrained(model_folder, trust_remote_code=True, torch_dtype=torch.bfloat16)
    trained_tokenizer = AutoTokenizer.from_pretrained(model_folder, trust_remote_code=True, padding_side='left')

    trained_model.generation_config = GenerationConfig(
        max_new_tokens=128,
        use_cache=True,
        do_sample=True,
        temperature=0.1,
        top_k=40,
        top_p=1.0,
        repetition_penalty=1.15,
        eos_token_id=trained_model.config.eos_token_id,
        pad_token_id=trained_model.config.pad_token_id,
    )

    print("Evaluating...")
    correct_answers = 0
    total_answers = 0
    color_mismatches = 0
    failed_examples = []

    with tqdm(total=len(dataset), desc="Accuracy") as pbar:
        for batch_start in range(0, len(dataset), batch_size):
            batch = dataset[batch_start:batch_start + batch_size]
            texts = batch["text"]
            prompts = [example.split(split)[0] + split for example in texts]
            expected_responses = [example.split(split)[1] for example in texts]
            output = generate(trained_model, trained_tokenizer, prompts)

            for model_output, expected_response in zip(output, expected_responses):
                # The decoded output still contains the prompt and special
                # tokens; keep only what follows the split marker.
                response = (
                    model_output
                    .replace(trained_tokenizer.pad_token, "")
                    .replace(trained_tokenizer.eos_token, "")
                    .split(split)[1]
                )

                expected_service_calls = [
                    json.loads(line)
                    for line in _iter_service_call_lines(service_call_regex, expected_response)
                ]
                # Every expected service call is one possible answer.
                total_answers += len(expected_service_calls)

                for line in _iter_service_call_lines(service_call_regex, response):
                    try:
                        json_output = json.loads(line)
                    except json.JSONDecodeError:
                        failed_examples.append({ "expected": expected_response, "actual": response, "invalid_json": True })
                        continue

                    if json_output in expected_service_calls:
                        # Consume the match so duplicates are not over-credited.
                        expected_service_calls.remove(json_output)
                        correct_answers += 1
                    elif (
                        isinstance(json_output, dict)
                        and "rgb_color" in json_output
                        and _pop_color_insensitive_match(expected_service_calls, json_output) is not None
                    ):
                        # Right call, different color: counted as correct but
                        # tracked separately.
                        correct_answers += 1
                        color_mismatches += 1
                    else:
                        failed_examples.append({ "expected": expected_response, "actual": response })

            # Advance by the real batch length so the final partial batch does
            # not push the bar past its total.
            pbar.update(len(texts))
            running_accuracy = correct_answers / total_answers if total_answers else 0.0
            pbar.set_description(f"Accuracy: {running_accuracy*100:.2f}% ({correct_answers}/{total_answers})")

    # Avoid ZeroDivisionError when no expected service calls were found.
    accuracy = correct_answers / total_answers if total_answers else 0.0
    print(f"Final Accuracy Rating: {accuracy*100:.2f}%")
    print(f"Color Mismatches: {color_mismatches}")

    with open(os.path.join(model_folder, "eval_results.json"), "w", encoding="utf-8") as f:
        json.dump({
            "possible_answers": total_answers,
            "correct_answers": correct_answers,
            "accuracy": accuracy,
            "color_mismatches": color_mismatches,
            "failed_examples": failed_examples,
        }, f, indent=4)
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()