Skip to content

Commit

Permalink
support human evaluation task release
Browse files Browse the repository at this point in the history
  • Loading branch information
lwaekfjlk committed Feb 23, 2024
1 parent 7ba57e4 commit b72bd75
Show file tree
Hide file tree
Showing 326 changed files with 8,240 additions and 0 deletions.
1 change: 1 addition & 0 deletions human_eval/all_environment_profile.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
1 change: 1 addition & 0 deletions human_eval/complete_gpt_score.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions human_eval/complete_pk_agent_pairs.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[["01H8XN69V9XK68A7KM559RSTAN", "01H7VFHPDZVVCDZR3AARA547CY", "agent2"], ["01H8XY83Y9AZN63DY1NR40PXYE", "01H7VFHN7WJK7VWVRZZTQ6DX9T", "agent1"], ["01HB5HC17Z50V4GAAXF90ES9F4", "01H7VFHQ11NAMZS4A2RDGDB01V", "agent2"], ["01HB46D6FNQPAD9K41QWV7VT99", "01H7VFHN9W0WAFZCBT09PKJJNK", "agent1"], ["01HB5G7RBJF4ZKWGKCDEJZMSGT", "01H7VFHQ11NAMZS4A2RDGDB01V", "agent2"], ["01HB50E792RNNBFM6GSP974T2T", "01H7VFHPS5WJW2694R1MNC8JFY", "agent1"], ["01H8XJVAVYKYMM4ZHV40BS1DN1", "01H7VFHP8AN5643B0NR0NP00VE", "agent2"], ["01HB4653T5G9A0S2XCFGAQ7KW9", "01H7VFHPS5WJW2694R1MNC8JFY", "agent1"], ["01H8ZDKXKRGD9WMFPX3J7SC785", "01H7VFHNN7XTR99319DS8KZCQM", "agent1"], ["01HB46VXVS08JFK7WXGS3Q8HXX", "01H7VFHPS5WJW2694R1MNC8JFY", "agent1"], ["01H8XJPA54WBXGZX4YJ55N9BBY", "01H7VFHPDZVVCDZR3AARA547CY", "agent2"], ["01HB470E5HRTGRKNSMZXESZT1C", "01H7VFHPSWGDGEYRP63H2DJKV0", "agent1"], ["01HB4SSKK5YR60PB6VTCEVBHZ0", "01H7VFHPSWGDGEYRP63H2DJKV0", "agent1"], ["01H8XWB2D7FX5CCKNH0HWK5SPT", "01H7VFHNNYH3W0VRWVY178K2TK", "agent1"], ["01HB5H3HYBR5FBXK2YWYSJV1H0", "01H7VFHPSWGDGEYRP63H2DJKV0", "agent2"], ["01H87H87X72A6HT9PXQRFPMPHB", "01H7VFHNNYH3W0VRWVY178K2TK", "agent1"], ["01H8WYTF2RRA8KDH0K95ADJQSF", "01H7VFHN7WJK7VWVRZZTQ6DX9T", "agent2"], ["01H8XWSS6K4C43P4QTGXGXDY2J", "01H7VFHN5WVC5HKKVBHZBA553R", "agent1"], ["01H87HD66NJHMD2JD4FXRPRSHP", "01H7VFHP8AN5643B0NR0NP00VE", "agent1"], ["01H84WZDP4XQ1224ESJS2K0PC2", "01H7VFHN7A1ZX5KSMT2YN9RXC4", "agent2"], ["01HB5JSJEHXW78MW4YT0WZ1K8A", "01H7VFHPS5WJW2694R1MNC8JFY", "agent2"], ["01HB45DSV3J8ZT8RKFK81ZFFVG", "01H7VFHPS5WJW2694R1MNC8JFY", "agent1"], ["01HB45T3DJPCBAE1XRZ5WA1T95", "01H7VFHN9W0WAFZCBT09PKJJNK", "agent1"], ["01HB47WB6Q1Z0X2EA6EM1FH2ZE", "01H7VFHPQQQY6H4DNC6NBQ8XTG", "agent1"], ["01HB44TFXYZ0TTJG3MPYB1QGYB", "01H7VFHPSWGDGEYRP63H2DJKV0", "agent1"], ["01HB50ST0K2Z3XXH8AXX3KPEE3", "01H7VFHQ11NAMZS4A2RDGDB01V", "agent1"], ["01H8ZNC0N5HGM6X1GYX9H1Q9RF", "01H7VFHNNYH3W0VRWVY178K2TK", "agent2"], ["01HB5J5VH7S9W1QJSMSE926J2R", 
"01H7VFHPQQQY6H4DNC6NBQ8XTG", "agent2"], ["01H8XMYAX968BZT9MZRERJNYMS", "01H7VFHN7A1ZX5KSMT2YN9RXC4", "agent2"], ["01H7VM4CJK6ZFQFT26FN8NSYME", "01H7VFHNNYH3W0VRWVY178K2TK", "agent2"], ["01H7ZCENRMAVPZ1DKBJW6CDHB9", "01H7VFHNN7XTR99319DS8KZCQM", "agent1"], ["01H87HM19XVGG1RN95WEKP8MXG", "01H7VFHN5WVC5HKKVBHZBA553R", "agent1"], ["01HB44THFBJFQPXRV92P8ZAMM5", "01H7VFHPQQQY6H4DNC6NBQ8XTG", "agent1"], ["01H8XWV8V6YHQAACDM0QS2HVKZ", "01H7VFHNF4G18PC9JHGRC8A1R6", "agent1"], ["01H8XWGW6KM9AKY52EKW0BX99P", "01H7VFHP8AN5643B0NR0NP00VE", "agent1"], ["01HB5H3GR354TGTW0Y60R894W5", "01H7VFHPQQQY6H4DNC6NBQ8XTG", "agent2"], ["01HB50BDD53V8W0DACNJREFY96", "01H7VFHQ11NAMZS4A2RDGDB01V", "agent1"], ["01H8WYAT5N79JNS50EKCPENH1T", "01H7VFHPDZVVCDZR3AARA547CY", "agent2"], ["01H8XN1Z9ACQZKV3QMJ9J84PXR", "01H7VFHNV13MHN97GAH73E3KM8", "agent2"], ["01HB45WEKW8T3QHHF74RX6PQ77", "01H7VFHQ11NAMZS4A2RDGDB01V", "agent1"], ["01H8ZMXM0ERH0A7715WAHCHB2K", "01H7VFHN5WVC5HKKVBHZBA553R", "agent2"], ["01H7ZE6HNCQ5NY915JGQ7A5R87", "01H7VFHNF4G18PC9JHGRC8A1R6", "agent1"], ["01HB450MC98QAFTYFCYGYSVFSA", "01H7VFHN9W0WAFZCBT09PKJJNK", "agent1"], ["01H87JDYKP1FMK0X01KYA21RTQ", "01H7VFHNV13MHN97GAH73E3KM8", "agent1"], ["01H84SW7PCD8ACM4K0S2BTEXYJ", "01H7VFHNF4G18PC9JHGRC8A1R6", "agent2"], ["01H7ZCCF5JPN28S5PNAWQJXENW", "01H7VFHNN7XTR99319DS8KZCQM", "agent2"], ["01H8XM9809AYET7QY65RMTDJPX", "01H7VFHN7WJK7VWVRZZTQ6DX9T", "agent2"], ["01H8ZDPA6QMNG3F57G742R3FKV", "01H7VFHNN7XTR99319DS8KZCQM", "agent1"], ["01H8ZDMYV92X4GXW9N656ZX6HW", "01H7VFHNNYH3W0VRWVY178K2TK", "agent1"], ["01HB5J35B6TADMW4JWFEPMX4BW", "01H7VFHPS5WJW2694R1MNC8JFY", "agent2"], ["01HB5JW1T8K61NACS9E2N04XHQ", "01H7VFHPQQQY6H4DNC6NBQ8XTG", "agent2"], ["01H8XQ3B6XJVQ5F28Q348Y5Y2H", "01H7VFHNV13MHN97GAH73E3KM8", "agent2"], ["01H87HMXBM2Q805VAFR5ASQJ17", "01H7VFHNF4G18PC9JHGRC8A1R6", "agent1"], ["01H8WW3F7585KPJP4VKE6GMB07", "01H7VFHP8AN5643B0NR0NP00VE", "agent1"], ["01H8Y03TVHQSHN790HV9SJCSFP", "01H7VFHNV13MHN97GAH73E3KM8", "agent1"], 
["01H7VNMYSSJT6YV27TV1EFZH62", "01H7VFHPDZVVCDZR3AARA547CY", "agent1"], ["01HB50WGG1Q86RPEAC310RPXAJ", "01H7VFHPS5WJW2694R1MNC8JFY", "agent1"], ["01H8XXPRAMM87VEMJE16Z4357D", "01H7VFHPDZVVCDZR3AARA547CY", "agent1"], ["01H8XNNSHQ7TWBB87786H32FJY", "01H7VFHN5WVC5HKKVBHZBA553R", "agent2"], ["01H8WZYKA9N71VHA5BEXJHXV7M", "01H7VFHNV13MHN97GAH73E3KM8", "agent2"], ["01H8XXCE9ERYVCSSG3FC4PF600", "01H7VFHN7A1ZX5KSMT2YN9RXC4", "agent1"], ["01H84QNYWPEJC8DJAHFHCQQ633", "01H7VFHPDZVVCDZR3AARA547CY", "agent2"], ["01H8XY4NPGD83XJW1Q32R6KQWB", "01H7VFHNF4G18PC9JHGRC8A1R6", "agent1"], ["01H8XY3HMB31PM0SAWXPCQG5R1", "01H7VFHN5WVC5HKKVBHZBA553R", "agent1"], ["01H8WWA56DXV1R7XS34D1W8PDW", "01H7VFHNF4G18PC9JHGRC8A1R6", "agent1"], ["01H8WW03FRW2QQN2DJXHXR2DND", "01H7VFHPDZVVCDZR3AARA547CY", "agent1"], ["01H8WWBFS2P4033GKTEW5Z8ZKH", "01H7VFHN7WJK7VWVRZZTQ6DX9T", "agent1"], ["01H84TPAKBWCM0JZ77RCYBYZTE", "01H7VFHN7WJK7VWVRZZTQ6DX9T", "agent2"], ["01H87HRWZ4WJSBY2M16ZEMPGW6", "01H7VFHN7WJK7VWVRZZTQ6DX9T", "agent1"], ["01H87HF7X0SZS9Y5PJ2QER6H55", "01H7VFHNN7XTR99319DS8KZCQM", "agent1"], ["01HB44HRVVCTVXXE1RN3AD0G8A", "01H7VFHQ11NAMZS4A2RDGDB01V", "agent1"], ["01H8XNXPKRS3HXDV9KS60CGJ4H", "01H7VFHN7WJK7VWVRZZTQ6DX9T", "agent2"], ["01HB508JX7MF04STRA9K8DR0Q7", "01H7VFHN9W0WAFZCBT09PKJJNK", "agent1"], ["01HB4SANVJVDQH2V5K1GWXRM00", "01H7VFHN9W0WAFZCBT09PKJJNK", "agent1"], ["01H8XNGF51SP201RZMNH7N6J6Y", "01H7VFHNN7XTR99319DS8KZCQM", "agent2"], ["01HB5HTJYKS9YF3SHV6M72XSJ3", "01H7VFHN9W0WAFZCBT09PKJJNK", "agent2"], ["01H8WWSWCV2FPYHH8PGACB4BTP", "01H7VFHNV13MHN97GAH73E3KM8", "agent1"], ["01H7ZE1CPNJER8EPVEKXMZ90ZP", "01H7VFHN5WVC5HKKVBHZBA553R", "agent1"], ["01HB471NAZ2PRTQDKSBN2STCS0", "01H7VFHPQQQY6H4DNC6NBQ8XTG", "agent1"], ["01H7ZE3VN3XX594BQDGW4159XM", "01H7VFHNF4G18PC9JHGRC8A1R6", "agent2"], ["01H84QDBP7ZDYWM71Y2RTF9TNN", "01H7VFHNNYH3W0VRWVY178K2TK", "agent2"], ["01H7ZBQKAV28573QWESP12FPF9", "01H7VFHP8AN5643B0NR0NP00VE", "agent1"], ["01H8XNBWV836M3HEH449XN003H", 
"01H7VFHP8AN5643B0NR0NP00VE", "agent2"], ["01H84RP0VAV5W0G659EAP0K85R", "01H7VFHP8AN5643B0NR0NP00VE", "agent2"], ["01H87H9QMVEW19N4EGNDN84SRB", "01H7VFHPDZVVCDZR3AARA547CY", "agent1"], ["01H7VNJVBNGVG6DPPKB8ZAC13C", "01H7VFHPDZVVCDZR3AARA547CY", "agent2"], ["01HB5J5SVYNSTYMGFYDC5MNJ3Y", "01H7VFHPSWGDGEYRP63H2DJKV0", "agent2"], ["01HB5JARXHQVRM4FB8V4RBTSQB", "01H7VFHN9W0WAFZCBT09PKJJNK", "agent2"], ["01H8WW910XYJGSXF3D2JN12A8F", "01H7VFHN5WVC5HKKVBHZBA553R", "agent1"], ["01HB5HMNCZYCM6909TZCN3Q7MY", "01H7VFHPSWGDGEYRP63H2DJKV0", "agent2"], ["01H8XWAT0YDSDT3K378PJ1P21Y", "01H7VFHPDZVVCDZR3AARA547CY", "agent1"], ["01HB45KTZM8V0RAYXJYS4RK471", "01H7VFHPQQQY6H4DNC6NBQ8XTG", "agent1"], ["01H8XKKBSEKC0SPTDC6YQK1430", "01H7VFHN5WVC5HKKVBHZBA553R", "agent2"], ["01H8XXV421WZHT142BE4B1RDYY", "01H7VFHP8AN5643B0NR0NP00VE", "agent1"], ["01HB5GPMA9YPT60PNT85GEBQXS", "01H7VFHN9W0WAFZCBT09PKJJNK", "agent2"], ["01H84SSH35DGRVAJBFK00XZ372", "01H7VFHN5WVC5HKKVBHZBA553R", "agent2"], ["01HB5JVKA362MYABAQMM14JD8P", "01H7VFHPSWGDGEYRP63H2DJKV0", "agent2"], ["01H8XN702MP234MV9QS599T2MZ", "01H7VFHNNYH3W0VRWVY178K2TK", "agent2"], ["01H8XWYP2ZS8NKEHJSPESQR38A", "01H7VFHN7WJK7VWVRZZTQ6DX9T", "agent1"], ["01H8XXGYDR2WSBJZ5AQMH6JF0B", "01H7VFHNV13MHN97GAH73E3KM8", "agent1"], ["01H7ZFZ9951QWAB3AEYGMZ0Q3W", "01H7VFHN7WJK7VWVRZZTQ6DX9T", "agent2"], ["01H84S23ESE3V98D5TT7FBRHGH", "01H7VFHNN7XTR99319DS8KZCQM", "agent2"], ["01H8XXY7YQE9WWXD6XMDK23NST", "01H7VFHNN7XTR99319DS8KZCQM", "agent1"], ["01HB5G44S54TS5GG7CS1PFRJ5S", "01H7VFHN9W0WAFZCBT09PKJJNK", "agent2"], ["01H8WZV4CPTR18B8T3VD30GXTJ", "01H7VFHN7A1ZX5KSMT2YN9RXC4", "agent2"], ["01HB5GGJA6KJV9944ZCBTKWES8", "01H7VFHPQQQY6H4DNC6NBQ8XTG", "agent2"], ["01H7ZN9GA4GQ0H38D60A0HKWAR", "01H7VFHN7A1ZX5KSMT2YN9RXC4", "agent1"], ["01H7ZDZBEHQ3B2HGF1QGZ0AWV3", "01H7VFHN5WVC5HKKVBHZBA553R", "agent2"], ["01H8ZN2JYHZH9DZ056KRXS0GJX", "01H7VFHNNYH3W0VRWVY178K2TK", "agent2"], ["01HB5HN6DQ1F0EANTR640GMT2V", "01H7VFHPQQQY6H4DNC6NBQ8XTG", "agent2"], 
["01HB5HJG3QW5CTE0CS5EBPEZX5", "01H7VFHPS5WJW2694R1MNC8JFY", "agent2"], ["01H8WYR7TQ9J88QZHP45YDNX7C", "01H7VFHNF4G18PC9JHGRC8A1R6", "agent2"], ["01HB5GT8SZ8ZHRYZBFMHPQFAKW", "01H7VFHQ11NAMZS4A2RDGDB01V", "agent2"], ["01H87J87QCY7D3C23G415ZQBPW", "01H7VFHN7A1ZX5KSMT2YN9RXC4", "agent1"], ["01H8ZNDK7QXDYYSM62FWAJG6K9", "01H7VFHN7A1ZX5KSMT2YN9RXC4", "agent2"], ["01H8XYVE9EPSZD76N45YQEPYKB", "01H7VFHNV13MHN97GAH73E3KM8", "agent1"], ["01H8XYPF4E2C0A4PEEDWJ262T4", "01H7VFHN7A1ZX5KSMT2YN9RXC4", "agent1"], ["01H7VM65KGGY7842XGE5A7PEK1", "01H7VFHNNYH3W0VRWVY178K2TK", "agent1"], ["01H8WYK5SQYJYD2MZNGJ2GS08P", "01H7VFHNN7XTR99319DS8KZCQM", "agent2"], ["01HB5HWR4VZFF96AY5F005D0S6", "01H7VFHQ11NAMZS4A2RDGDB01V", "agent2"], ["01HB5GDES841SWM1GB6XP8T0XZ", "01H7VFHPS5WJW2694R1MNC8JFY", "agent2"], ["01H8XTNGK3J3REC4WPB90X37DK", "01H7VFHNV13MHN97GAH73E3KM8", "agent2"], ["01HB5GGCT4XZ3KVHMY36W6242E", "01H7VFHPSWGDGEYRP63H2DJKV0", "agent2"], ["01HB5JJKT5R4DXZ81ERTFQYY56", "01H7VFHQ11NAMZS4A2RDGDB01V", "agent2"], ["01H8XXN9185ATPJ3C5ABDNVCJP", "01H7VFHNNYH3W0VRWVY178K2TK", "agent1"], ["01H7ZG0QK0TJZS2BAVHQMQ63A0", "01H7VFHN7WJK7VWVRZZTQ6DX9T", "agent1"], ["01H8ZNHANA910ADSCPV61PD9MG", "01H7VFHN7A1ZX5KSMT2YN9RXC4", "agent2"], ["01HB47WDFP783808SHY1FZYCNT", "01H7VFHPSWGDGEYRP63H2DJKV0", "agent1"], ["01H8WWQDM9J4VDVCSWNXFRYT3T", "01H7VFHN7A1ZX5KSMT2YN9RXC4", "agent1"], ["01HB5HA6WWY92T037M8C4B4KMP", "01H7VFHN9W0WAFZCBT09PKJJNK", "agent2"], ["01H8XKMFZR88S9T2NBR2G4K3PQ", "01H7VFHNF4G18PC9JHGRC8A1R6", "agent2"], ["01H8XNR40GVMD5MZ20GKS4THSN", "01H7VFHNF4G18PC9JHGRC8A1R6", "agent2"], ["01H7ZBMR7VXB86S4A5J7B0XMJJ", "01H7VFHP8AN5643B0NR0NP00VE", "agent2"], ["01HB50M5ZZC1FQX6HRBFYB722X", "01H7VFHPQQQY6H4DNC6NBQ8XTG", "agent1"], ["01H84XXWFJHDKB1T0XB4M3TTJZ", "01H7VFHNV13MHN97GAH73E3KM8", "agent2"], ["01HB4SFNE79625955A5P5JS0VW", "01H7VFHQ11NAMZS4A2RDGDB01V", "agent1"], ["01H8XKE7670H7FY848MZJA21QZ", "01H7VFHNN7XTR99319DS8KZCQM", "agent2"], ["01HB50GY0RQRX8T3YVRS3R9BK6", 
"01H7VFHPSWGDGEYRP63H2DJKV0", "agent1"], ["01H8WYG48PFQTH6XREAVS5WRTN", "01H7VFHP8AN5643B0NR0NP00VE", "agent2"], ["01HB5H0ETGQ16VCMSMQXF9PSVW", "01H7VFHPS5WJW2694R1MNC8JFY", "agent2"]]
55 changes: 55 additions & 0 deletions human_eval/hard_scenario_sampling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import json
from collections import defaultdict
import random

# Input/output files, resolved relative to the current working directory.
SCORE_FILE = 'complete_gpt_score.json'
PAIR_FILE = 'complete_pk_agent_pairs.json'
OUTPUT_FILE = 'pk_list.json'

# Score dimensions averaged over every sampled datapoint.
DIMENSIONS = (
    'believability',
    'relationship',
    'knowledge',
    'secret',
    'social_rules',
    'financial_and_material_benefits',
    'goal',
    'overall_score',
)

NUM_TRIALS = 1000      # maximum number of random samplings to try
SAMPLES_PER_ENV = 2    # datapoints drawn from each environment per trial
# A sample is accepted when its mean 'goal' score is within GOAL_TOLERANCE of
# TARGET_GOAL.
# NOTE(review): 5.9 is presumably the mean goal score the released subset
# should reproduce -- confirm where this number comes from.
TARGET_GOAL = 5.9
GOAL_TOLERANCE = 0.1


def build_env_candidates(complete_gpt_score, complete_pk_agent_pairs):
    """Group the selected per-agent score dicts by environment.

    Args:
        complete_gpt_score: mapping of episode pk -> record holding the keys
            'env_pk', 'agent1' and 'agent2'; the two agent entries are dicts
            of score dimensions.
        complete_pk_agent_pairs: sequence of entries whose first element is
            an episode pk and whose last element is 'agent1' or 'agent2'.
            Any last element other than 'agent1' selects the agent2 scores.

    Returns:
        dict mapping env_pk -> list of score dicts, each a copy of the
        selected agent's scores with an added 'pk' key. Environments and
        episodes keep the order of ``complete_gpt_score``; an episode listed
        several times in the pairs contributes one entry per listing. An
        environment with no listed episode maps to an empty list.
    """
    # Index the pairs once instead of scanning the whole list per episode.
    agents_by_pk = defaultdict(list)
    for pair in complete_pk_agent_pairs:
        agents_by_pk[pair[0]].append(pair[-1])

    candidates = defaultdict(list)
    for pk, record in complete_gpt_score.items():
        # Touch the bucket so every environment is present, even when empty.
        bucket = candidates[record['env_pk']]
        for agent in agents_by_pk.get(pk, ()):
            side = 'agent1' if agent == 'agent1' else 'agent2'
            # Copy so the loaded score data is never mutated.
            bucket.append({**record[side], 'pk': pk})
    return dict(candidates)


def sample_scores(env_candidates, k=SAMPLES_PER_ENV, rng=random):
    """Draw ``k`` score dicts without replacement from every environment.

    Args:
        env_candidates: mapping env_pk -> list of score dicts.
        k: number of datapoints to draw per environment.
        rng: object providing ``sample(population, k)``; defaults to the
            ``random`` module.

    Returns:
        Flat list of the sampled score dicts, in environment order.

    Raises:
        ValueError: if an environment has fewer than ``k`` candidates.
    """
    sampled = []
    for env_pk, scores in env_candidates.items():
        if len(scores) < k:
            raise ValueError(
                'environment %r has %d candidate(s), need at least %d'
                % (env_pk, len(scores), k)
            )
        sampled.extend(rng.sample(scores, k))
    return sampled


def average_scores(sampled, dimensions=DIMENSIONS):
    """Return the mean of every dimension over the sampled score dicts.

    Raises:
        ValueError: if ``sampled`` is empty.
    """
    if not sampled:
        raise ValueError('cannot average an empty sample')
    count = len(sampled)
    return {
        dimension: sum(score[dimension] for score in sampled) / count
        for dimension in dimensions
    }


def main():
    """Resample until the mean goal score is close enough to the target.

    Reads the two JSON inputs, repeatedly samples datapoints per environment,
    and writes the pks of the first accepted sample to OUTPUT_FILE. Finally
    prints the mean and the maximum of the goal averages seen so far.
    """
    with open(SCORE_FILE, 'r') as f:
        complete_gpt_score = json.load(f)

    with open(PAIR_FILE, 'r') as f:
        complete_pk_agent_pairs = json.load(f)

    # The candidate pool does not change between trials, so build it once.
    env_candidates = build_env_candidates(
        complete_gpt_score, complete_pk_agent_pairs
    )

    goal_score = []
    for _ in range(NUM_TRIALS):
        sampled_data = sample_scores(env_candidates)
        average_score = average_scores(sampled_data)
        goal_score.append(average_score['goal'])

        pks = [data['pk'] for data in sampled_data]
        for pk in pks:
            print(pk)

        if abs(average_score['goal'] - TARGET_GOAL) < GOAL_TOLERANCE:
            print(len(set(pks)))
            with open(OUTPUT_FILE, 'w') as f:
                json.dump(pks, f, indent=4)
            # Stop at the first accepted sample so later trials cannot
            # overwrite the saved pk list.
            break

    print(sum(goal_score) / len(goal_score))
    print(max(goal_score))


if __name__ == '__main__':
    main()
Loading

0 comments on commit b72bd75

Please sign in to comment.