# mmlu_avg.py
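# 'result' below is the per-task output of an MMLU (hendrycksTest-*) evaluation,
# e.g. as reported by a harness such as lm-evaluation-harness; each entry holds
# 'acc', 'acc_norm', and their standard errors for one of the 57 subtasks.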
result = {
'hendrycksTest-abstract_algebra': {'acc': 0.26, 'acc_stderr': 0.0440844002276808, 'acc_norm': 0.24, 'acc_norm_stderr': 0.04292346959909283}, 'hendrycksTest-anatomy': {'acc': 0.2740740740740741, 'acc_stderr': 0.03853254836552003, 'acc_norm': 0.23703703703703705, 'acc_norm_stderr': 0.03673731683969506}, 'hendrycksTest-astronomy': {'acc': 0.3618421052631579, 'acc_stderr': 0.03910525752849724, 'acc_norm': 0.40789473684210525, 'acc_norm_stderr': 0.03999309712777471}, 'hendrycksTest-business_ethics': {'acc': 0.43, 'acc_stderr': 0.049756985195624284, 'acc_norm': 0.33, 'acc_norm_stderr': 0.04725815626252604}, 'hendrycksTest-clinical_knowledge': {'acc': 0.32452830188679244, 'acc_stderr': 0.028815615713432115, 'acc_norm': 0.3622641509433962, 'acc_norm_stderr': 0.0295822451283843}, 'hendrycksTest-college_biology': {'acc': 0.22916666666666666, 'acc_stderr': 0.035146974678623884, 'acc_norm': 0.2569444444444444, 'acc_norm_stderr': 0.03653946969442099}, 'hendrycksTest-college_chemistry': {'acc': 0.21, 'acc_stderr': 0.040936018074033256, 'acc_norm': 0.3, 'acc_norm_stderr': 0.046056618647183814}, 'hendrycksTest-college_computer_science': {'acc': 0.29, 'acc_stderr': 0.045604802157206845, 'acc_norm': 0.28, 'acc_norm_stderr': 0.04512608598542128}, 'hendrycksTest-college_mathematics': {'acc': 0.21, 'acc_stderr': 0.040936018074033256, 'acc_norm': 0.29, 'acc_norm_stderr': 0.045604802157206845}, 'hendrycksTest-college_medicine': {'acc': 0.2832369942196532, 'acc_stderr': 0.034355680560478746, 'acc_norm': 0.28901734104046245, 'acc_norm_stderr': 0.03456425745087}, 'hendrycksTest-college_physics': {'acc': 0.2647058823529412, 'acc_stderr': 0.04389869956808778, 'acc_norm': 0.3333333333333333, 'acc_norm_stderr': 0.04690650298201943}, 'hendrycksTest-computer_security': {'acc': 0.3, 'acc_stderr': 0.046056618647183814, 'acc_norm': 0.33, 'acc_norm_stderr': 0.047258156262526045}, 'hendrycksTest-conceptual_physics': {'acc': 0.2851063829787234, 'acc_stderr': 0.02951319662553935, 'acc_norm': 0.2127659574468085, 'acc_norm_stderr': 0.026754391348039766}, 'hendrycksTest-econometrics': {'acc': 0.30701754385964913, 'acc_stderr': 0.04339138322579861, 'acc_norm': 0.20175438596491227, 'acc_norm_stderr': 0.037752050135836386}, 'hendrycksTest-electrical_engineering': {'acc': 0.36551724137931035, 'acc_stderr': 0.040131241954243856, 'acc_norm': 0.36551724137931035, 'acc_norm_stderr': 0.04013124195424386}, 'hendrycksTest-elementary_mathematics': {'acc': 0.2751322751322751, 'acc_stderr': 0.02300008685906865, 'acc_norm': 0.25925925925925924, 'acc_norm_stderr': 0.022569897074918407}, 'hendrycksTest-formal_logic': {'acc': 0.30158730158730157, 'acc_stderr': 0.04104947269903394, 'acc_norm': 0.29365079365079366, 'acc_norm_stderr': 0.040735243221471276}, 'hendrycksTest-global_facts': {'acc': 0.32, 'acc_stderr': 0.04688261722621504, 'acc_norm': 0.27, 'acc_norm_stderr': 0.044619604333847394}, 'hendrycksTest-high_school_biology': {'acc': 0.2870967741935484, 'acc_stderr': 0.02573654274559453, 'acc_norm': 0.3096774193548387, 'acc_norm_stderr': 0.026302774983517418}, 'hendrycksTest-high_school_chemistry': {'acc': 0.21674876847290642, 'acc_stderr': 0.02899033125251624, 'acc_norm': 0.2955665024630542, 'acc_norm_stderr': 0.032104944337514575}, 'hendrycksTest-high_school_computer_science': {'acc': 0.28, 'acc_stderr': 0.04512608598542127, 'acc_norm': 0.29, 'acc_norm_stderr': 0.04560480215720684}, 'hendrycksTest-high_school_european_history': {'acc': 0.34545454545454546, 'acc_stderr': 0.03713158067481912, 'acc_norm': 0.3333333333333333, 'acc_norm_stderr': 
0.0368105086916155}, 'hendrycksTest-high_school_geography': {'acc': 0.31313131313131315, 'acc_stderr': 0.033042050878136525, 'acc_norm': 0.30808080808080807, 'acc_norm_stderr': 0.03289477330098616}, 'hendrycksTest-high_school_government_and_politics': {'acc': 0.2849740932642487, 'acc_stderr': 0.0325771407770966, 'acc_norm': 0.30569948186528495, 'acc_norm_stderr': 0.03324837939758159}, 'hendrycksTest-high_school_macroeconomics': {'acc': 0.31025641025641026, 'acc_stderr': 0.023454674889404295, 'acc_norm': 0.30256410256410254, 'acc_norm_stderr': 0.02329088805377274}, 'hendrycksTest-high_school_mathematics': {'acc': 0.2111111111111111, 'acc_stderr': 0.024882116857655092, 'acc_norm': 0.3148148148148148, 'acc_norm_stderr': 0.02831753349606648}, 'hendrycksTest-high_school_microeconomics': {'acc': 0.3277310924369748, 'acc_stderr': 0.030489911417673227, 'acc_norm': 0.37815126050420167, 'acc_norm_stderr': 0.03149930577784906}, 'hendrycksTest-high_school_physics': {'acc': 0.25165562913907286, 'acc_stderr': 0.035433042343899844, 'acc_norm': 0.26490066225165565, 'acc_norm_stderr': 0.036030385453603826}, 'hendrycksTest-high_school_psychology': {'acc': 0.326605504587156, 'acc_stderr': 0.020106990889937303, 'acc_norm': 0.25504587155963304, 'acc_norm_stderr': 0.018688500856535832}, 'hendrycksTest-high_school_statistics': {'acc': 0.3194444444444444, 'acc_stderr': 0.03179876342176852, 'acc_norm': 0.3148148148148148, 'acc_norm_stderr': 0.03167468706828979}, 'hendrycksTest-high_school_us_history': {'acc': 0.3284313725490196, 'acc_stderr': 0.03296245110172228, 'acc_norm': 0.31862745098039214, 'acc_norm_stderr': 0.03270287181482081}, 'hendrycksTest-high_school_world_history': {'acc': 0.28270042194092826, 'acc_stderr': 0.029312814153955924, 'acc_norm': 0.31223628691983124, 'acc_norm_stderr': 0.030165137867847008}, 'hendrycksTest-human_aging': {'acc': 0.2825112107623318, 'acc_stderr': 0.030216831011508762, 'acc_norm': 0.21973094170403587, 'acc_norm_stderr': 0.027790177064383605}, 'hendrycksTest-human_sexuality': {'acc': 0.4351145038167939, 'acc_stderr': 0.04348208051644858, 'acc_norm': 0.3282442748091603, 'acc_norm_stderr': 0.04118438565806298}, 'hendrycksTest-international_law': {'acc': 0.30578512396694213, 'acc_stderr': 0.04205953933884124, 'acc_norm': 0.5206611570247934, 'acc_norm_stderr': 0.04560456086387235}, 'hendrycksTest-jurisprudence': {'acc': 0.32407407407407407, 'acc_stderr': 0.04524596007030048, 'acc_norm': 0.4444444444444444, 'acc_norm_stderr': 0.04803752235190192}, 'hendrycksTest-logical_fallacies': {'acc': 0.27607361963190186, 'acc_stderr': 0.03512385283705051, 'acc_norm': 0.3374233128834356, 'acc_norm_stderr': 0.03714908409935573}, 'hendrycksTest-machine_learning': {'acc': 0.30357142857142855, 'acc_stderr': 0.04364226155841044, 'acc_norm': 0.25892857142857145, 'acc_norm_stderr': 0.041577515398656284}, 'hendrycksTest-management': {'acc': 0.2815533980582524, 'acc_stderr': 0.044532548363264673, 'acc_norm': 0.32038834951456313, 'acc_norm_stderr': 0.0462028408228004}, 'hendrycksTest-marketing': {'acc': 0.49145299145299143, 'acc_stderr': 0.032751303000970296, 'acc_norm': 0.44017094017094016, 'acc_norm_stderr': 0.032520741720630506}, 'hendrycksTest-medical_genetics': {'acc': 0.3, 'acc_stderr': 0.046056618647183814, 'acc_norm': 0.35, 'acc_norm_stderr': 0.0479372485441102}, 'hendrycksTest-miscellaneous': {'acc': 0.3716475095785441, 'acc_stderr': 0.01728080252213317, 'acc_norm': 0.3065134099616858, 'acc_norm_stderr': 0.01648695289304151}, 'hendrycksTest-moral_disputes': {'acc': 0.29190751445086704, 
'acc_stderr': 0.024476994076247323, 'acc_norm': 0.3179190751445087, 'acc_norm_stderr': 0.025070713719153176}, 'hendrycksTest-moral_scenarios': {'acc': 0.23798882681564246, 'acc_stderr': 0.014242630070574915, 'acc_norm': 0.27262569832402234, 'acc_norm_stderr': 0.014893391735249588}, 'hendrycksTest-nutrition': {'acc': 0.3627450980392157, 'acc_stderr': 0.027530078447110314, 'acc_norm': 0.42810457516339867, 'acc_norm_stderr': 0.02833239748366427}, 'hendrycksTest-philosophy': {'acc': 0.2861736334405145, 'acc_stderr': 0.025670259242188947, 'acc_norm': 0.3022508038585209, 'acc_norm_stderr': 0.026082700695399662}, 'hendrycksTest-prehistory': {'acc': 0.3117283950617284, 'acc_stderr': 0.025773111169630436, 'acc_norm': 0.26851851851851855, 'acc_norm_stderr': 0.024659685185967284}, 'hendrycksTest-professional_accounting': {'acc': 0.23049645390070922, 'acc_stderr': 0.025123739226872405, 'acc_norm': 0.2801418439716312, 'acc_norm_stderr': 0.026789172351140242}, 'hendrycksTest-professional_law': {'acc': 0.2620599739243807, 'acc_stderr': 0.011231552795890394, 'acc_norm': 0.28748370273794005, 'acc_norm_stderr': 0.011559337355708509}, 'hendrycksTest-professional_medicine': {'acc': 0.25735294117647056, 'acc_stderr': 0.026556519470041503, 'acc_norm': 0.29044117647058826, 'acc_norm_stderr': 0.02757646862274052}, 'hendrycksTest-professional_psychology': {'acc': 0.28431372549019607, 'acc_stderr': 0.01824902441120766, 'acc_norm': 0.27124183006535946, 'acc_norm_stderr': 0.0179866153040303}, 'hendrycksTest-public_relations': {'acc': 0.2909090909090909, 'acc_stderr': 0.04350271442923243, 'acc_norm': 0.17272727272727273, 'acc_norm_stderr': 0.03620691833929218}, 'hendrycksTest-security_studies': {'acc': 0.42857142857142855, 'acc_stderr': 0.03168091161233882, 'acc_norm': 0.3346938775510204, 'acc_norm_stderr': 0.03020923522624231}, 'hendrycksTest-sociology': {'acc': 0.2885572139303483, 'acc_stderr': 0.032038410402133226, 'acc_norm': 0.263681592039801, 'acc_norm_stderr': 0.03115715086935555}, 'hendrycksTest-us_foreign_policy': {'acc': 0.46, 'acc_stderr': 0.05009082659620332, 'acc_norm': 0.43, 'acc_norm_stderr': 0.049756985195624284}, 'hendrycksTest-virology': {'acc': 0.3373493975903614, 'acc_stderr': 0.03680783690727581, 'acc_norm': 0.3072289156626506, 'acc_norm_stderr': 0.035915667978246635}, 'hendrycksTest-world_religions': {'acc': 0.45614035087719296, 'acc_stderr': 0.03820042586602966, 'acc_norm': 0.4619883040935672, 'acc_norm_stderr': 0.038237270928823064}
}
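# Unweighted (macro) average of 'acc' over all tasks: every subtask counts
# equally, regardless of how many questions it contains.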
acc_values = [item['acc'] for item in result.values()]
mean_acc = sum(acc_values) / len(acc_values)
print(len(acc_values))
print(mean_acc)
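# A minimal optional extension (not part of the original script, just a sketch):
# the same unweighted average can be computed for the normalized-accuracy metric
# 'acc_norm', which is also present in every entry of 'result'.
acc_norm_values = [item['acc_norm'] for item in result.values()]
mean_acc_norm = sum(acc_norm_values) / len(acc_norm_values)
print(mean_acc_norm)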