medical exam changes and plotting, text2cypher
slobentanzer committed Aug 8, 2024
1 parent a686d93 commit 46ce9a6
Showing 31 changed files with 363 additions and 497 deletions.
6 changes: 3 additions & 3 deletions benchmark/data/benchmark_data.yaml
@@ -228,7 +228,7 @@ medical_exam:

# Math relevant questions

  - case: single_answer:math:en
  - case: single_choice:math:en
    input:
      prompt: A hollow organ of a patient contains 0.5 kg of liquid whose (mass) density is approximately the same as that of water. What is the approximate volume of this hollow organ? (A) 500 mm3 (B) 500 cm3 (C) 0,05 m3 (D) 0,5 m3 (E) 50 m3
      system_messages:
@@ -238,7 +238,7 @@ medical_exam:
    expected:
      answer: "b"

  - case: single_answer:math:en
  - case: single_choice:math:en
    input:
      prompt: In a car accident, an occupant weighing 75 kg is restrained by the tight-fitting seat belt, whereby a belt force limiter limits the force acting on the occupant to a maximum of 5,000 N. The acceleration acting on the occupant is thus limited (in terms of amount) to approx. (A) 0.067 m/s2 (B) 15 m/s2 (C) 33 m/s2 (D) 67 m/s2 (E) 375 m/s2
      system_messages:
@@ -248,7 +248,7 @@ medical_exam:
    expected:
      answer: "d"

  - case: single_answer:math:en
  - case: single_choice:math:en
    input:
      prompt: Medical oxygen is supplied in pressurised gas cylinders with an internal pressure of usually 200 bar. Manufacturers state that these cylinders should not be heated to over 50 °C and must be stored in a protected environment. By approximately what percentage does this internal pressure increase when such a pressurised gas cylinder is heated from 30 °C to 90 °C? (A) 3 % (B) 20 % (C) 40 % (D) 60 % (E) 300 %
      system_messages:
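Each of the three math items above reduces to a one-step calculation. A quick standalone check of the keyed answers (plain Python, not part of the benchmark files; the key for the third item is not visible in this hunk, but the estimate lands at roughly 20 %, option B):

# Quick check of the three physics items above (standalone; not part of the
# benchmark data files).

# (1) Volume of 0.5 kg of a water-like liquid: V = m / rho
volume_m3 = 0.5 / 1000.0                        # kg / (kg/m^3)
print(volume_m3 * 1e6, "cm^3")                  # 500.0 cm^3 -> option (B), keyed "b"

# (2) Acceleration limited by a 5,000 N belt force on a 75 kg occupant: a = F / m
acceleration_ms2 = 5000.0 / 75.0
print(round(acceleration_ms2), "m/s^2")         # ~67 m/s^2 -> option (D), keyed "d"

# (3) Isochoric heating from 30 °C to 90 °C: pressure scales with absolute temperature
t1_k, t2_k = 273.15 + 30.0, 273.15 + 90.0
print(round((t2_k / t1_k - 1) * 100, 1), "%")   # ~19.8 %, i.e. roughly 20 % (option B)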
493 changes: 118 additions & 375 deletions benchmark/results/medical_exam.csv

Large diffs are not rendered by default.

206 changes: 103 additions & 103 deletions benchmark/results/medical_exam_failure_modes.csv

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions benchmark/results/processed/correlations.txt
@@ -1,4 +1,4 @@
Size vs accuracy Pearson correlation: 0.2196874410865915
Size vs accuracy Pearson correlation p-value: 8.674722515042285e-09
Quantisation vs accuracy Pearson correlation: 0.2427859964015104
Quantisation vs accuracy Pearson correlation p-value: 1.797917250633135e-10
Size vs accuracy Pearson correlation: 0.22108985916662796
Size vs accuracy Pearson correlation p-value: 7.1171375567780455e-09
Quantisation vs accuracy Pearson correlation: 0.2455410361574719
Quantisation vs accuracy Pearson correlation p-value: 1.1369970729884332e-10
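These figures are produced by the stats step in docs/scripts/hooks.py (calculate_stats, not shown in this diff). A rough standalone sketch of such a Pearson correlation with scipy, using the overview table and the column names that appear further down, would look like this; the numeric-size conversion is an assumption for illustration:

# Rough sketch only; the committed numbers come from calculate_stats(),
# which is not part of this diff. Column names taken from the overview table.
import pandas as pd
from scipy import stats

overview = pd.read_csv("benchmark/results/processed/overview-quantisation.csv")
overview["size_numeric"] = pd.to_numeric(overview["Size"], errors="coerce")  # "Unknown" -> NaN
subset = overview.dropna(subset=["size_numeric", "Median Accuracy"])

r, p = stats.pearsonr(subset["size_numeric"], subset["Median Accuracy"])
print("Size vs accuracy Pearson correlation:", r)
print("Size vs accuracy Pearson correlation p-value:", p)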
7 changes: 3 additions & 4 deletions benchmark/results/processed/medical_exam.csv
@@ -17,9 +17,9 @@ openhermes-2.5:7:ggufv2:Q3_K_M,604.0,1071.0,1.7320508075688772,0.563958916900093
openhermes-2.5:7:ggufv2:Q2_K,576.0,1071.0,0.0,0.5378151260504201,3
llama-2-chat:13:ggufv2:Q8_0,462.0,1071.0,0.0,0.43137254901960786,3
llama-2-chat:13:ggufv2:Q5_K_M,462.0,1071.0,0.0,0.43137254901960786,3
llama-2-chat:13:ggufv2:Q6_K,459.0,1071.0,0.0,0.42857142857142855,3
llama-2-chat:13:ggufv2:Q4_K_M,459.0,1071.0,0.0,0.42857142857142855,3
llama-2-chat:13:ggufv2:Q3_K_M,459.0,1071.0,0.0,0.42857142857142855,3
llama-2-chat:13:ggufv2:Q6_K,459.0,1071.0,0.0,0.42857142857142855,3
chatglm3:6:ggmlv3:q4_0,457.0,1071.0,21.616171041461605,0.42670401493930904,3
llama-2-chat:13:ggufv2:Q2_K,444.0,1071.0,0.0,0.41456582633053224,3
llama-2-chat:7:ggufv2:Q6_K,435.0,1071.0,0.0,0.4061624649859944,3
@@ -29,14 +29,13 @@ llama-2-chat:7:ggufv2:Q5_K_M,429.0,1071.0,0.0,0.4005602240896359,3
llama-2-chat:7:ggufv2:Q3_K_M,423.0,1071.0,0.0,0.3949579831932773,3
llama-2-chat:7:ggufv2:Q2_K,396.0,1071.0,0.0,0.3697478991596639,3
mixtral-instruct-v0.1:46_7:ggufv2:Q4_K_M,395.0,1071.0,0.5773502691896257,0.36881419234360413,3
mistral-instruct-v0.2:7:ggufv2:Q8_0,393.0,1071.0,0.0,0.36694677871148457,3
mistral-instruct-v0.2:7:ggufv2:Q6_K,393.0,1071.0,0.0,0.36694677871148457,3
mistral-instruct-v0.2:7:ggufv2:Q8_0,393.0,1071.0,0.0,0.36694677871148457,3
mistral-instruct-v0.2:7:ggufv2:Q4_K_M,391.0,1071.0,0.5773502691896258,0.36507936507936506,3
mistral-instruct-v0.2:7:ggufv2:Q5_K_M,390.0,1071.0,0.0,0.3641456582633053,3
mistral-instruct-v0.2:7:ggufv2:Q3_K_M,386.0,1071.0,0.5773502691896258,0.3604108309990663,3
mixtral-instruct-v0.1:46_7:ggufv2:Q8_0,384.0,1071.0,0.0,0.3585434173669468,3
mixtral-instruct-v0.1:46_7:ggufv2:Q5_K_M,378.0,1071.0,0.0,0.35294117647058826,3
mistral-instruct-v0.2:7:ggufv2:Q2_K,378.0,1071.0,0.0,0.35294117647058826,3
mixtral-instruct-v0.1:46_7:ggufv2:Q5_K_M,378.0,1071.0,0.0,0.35294117647058826,3
mixtral-instruct-v0.1:46_7:ggufv2:Q6_K,367.0,1071.0,0.5773502691896257,0.3426704014939309,3
mixtral-instruct-v0.1:46_7:ggufv2:Q2_K,353.0,1071.0,0.5773502691896258,0.3295985060690943,3
gpt-3.5-turbo-0613,312.0,1071.0,0.0,0.2913165266106443,3
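The columns in this processed file are unlabelled in the hunk; judging by the values, they hold the summed score achieved, the score possible, the standard deviation across iterations, the resulting accuracy, and the number of iterations. For example, for the openhermes-2.5 Q2_K row:

# Inferred meaning of the unlabelled columns (not a documented schema):
# achieved, possible, SD across iterations, accuracy = achieved / possible, n iterations
achieved, possible = 576.0, 1071.0
print(achieved / possible)  # 0.5378151260504201, matching the fourth numeric column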
6 changes: 3 additions & 3 deletions benchmark/results/processed/naive_vs_biochatter.txt
@@ -1,4 +1,4 @@
mean: 0.8269791666666666 vs 0.4603125
mean: 0.8269791666666667 vs 0.4603125
std: 0.11106303523411587 vs 0.11423809220552653
p-value: 1.5661453605577185e-37
t-statistic: 18.410684234461385
p-value: 1.5661453605576738e-37
t-statistic: 18.410684234461392
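The comparison above looks like an independent two-sample t-test on per-test accuracies. A minimal standalone sketch with scipy, using made-up accuracy values since the underlying per-test data are not part of this diff:

# Minimal sketch with hypothetical accuracy values; only the procedure is illustrated.
import numpy as np
from scipy import stats

biochatter = np.array([0.90, 0.85, 0.80, 0.75, 0.95, 0.70])
naive = np.array([0.50, 0.45, 0.60, 0.35, 0.40, 0.55])

print("mean:", biochatter.mean(), "vs", naive.mean())
print("std:", biochatter.std(ddof=1), "vs", naive.std(ddof=1))
t_stat, p_value = stats.ttest_ind(biochatter, naive)
print("p-value:", p_value)
print("t-statistic:", t_stat)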
2 changes: 1 addition & 1 deletion benchmark/results/processed/overview-model.csv
@@ -2,8 +2,8 @@ Model name,Size,Median Accuracy,SD
gpt-3.5-turbo-0125,175,0.87,0.21
gpt-4-turbo-2024-04-09,Unknown,0.83,0.3
gpt-4-0613,Unknown,0.78,0.18
gpt-3.5-turbo-0613,175,0.76,0.21
gpt-4o-2024-05-13,Unknown,0.73,0.35
gpt-3.5-turbo-0613,175,0.73,0.24
gpt-4-0125-preview,Unknown,0.73,0.3
gpt-4o-mini-2024-07-18,Unknown,0.7,0.27
openhermes-2.5,7,0.7,0.32
2 changes: 1 addition & 1 deletion benchmark/results/processed/overview-quantisation.csv
@@ -2,10 +2,10 @@ Model name,Size,Version,Quantisation,Median Accuracy,SD
gpt-3.5-turbo-0125,175,,,0.87,0.21
gpt-4-turbo-2024-04-09,Unknown,,,0.83,0.3
gpt-4-0613,Unknown,,,0.78,0.18
gpt-3.5-turbo-0613,175,,,0.76,0.21
gpt-4-0125-preview,Unknown,,,0.73,0.3
gpt-4o-2024-05-13,Unknown,,,0.73,0.35
openhermes-2.5,7,ggufv2,Q5_K_M,0.73,0.32
gpt-3.5-turbo-0613,175,,,0.73,0.24
openhermes-2.5,7,ggufv2,Q8_0,0.71,0.32
openhermes-2.5,7,ggufv2,Q4_K_M,0.71,0.33
gpt-4o-mini-2024-07-18,Unknown,,,0.7,0.27
2 changes: 1 addition & 1 deletion benchmark/results/processed/overview.csv
@@ -2,10 +2,10 @@ Full model name,property_selection,query_generation,multimodal_answer,api_callin
gpt-3.5-turbo-0125,0.35625,0.9666666666666668,,0.7464788732394366,0.5100318961757607,1.0,1.0,1.0,0.9266666666666666,0.4866666666666667,0.9,0.6704014939309056,0.8666666666666667,0.7858190775010643,0.8666666666666667,0.2113612585450092
gpt-4-turbo-2024-04-09,0.303125,0.8266666666666667,0.99,,0.6503689591957673,1.0,1.0,0.0,0.6,0.5,1.0,0.8395061728395061,1.0,0.725805566558495,0.8266666666666667,0.30141137449353134
gpt-4-0613,0.359375,0.9666666666666668,,0.6190476190476191,0.6689027994568157,1.0,0.8888888888888888,0.65,0.88,0.68,1.0,0.7301587301587301,0.8888888888888888,0.7776607160923007,0.7776607160923007,0.17755825856670066
gpt-3.5-turbo-0613,0.3625,0.9466666666666668,,,0.5753814654033865,1.0,0.8888888888888888,0.5,0.8333333333333334,0.5,1.0,,0.7555555555555555,0.7362325909847831,0.7555555555555555,0.21192578872727258
gpt-4-0125-preview,0.0,0.8333333333333334,,0.7936507936507936,0.6897052189771663,1.0,0.7777777777777778,0.75,0.0,0.44,0.5,0.7759103641456583,0.7333333333333333,0.6078092351015052,0.7333333333333333,0.2951294777600531
gpt-4o-2024-05-13,0.0,0.8,0.96,0.8095238095238095,0.6539462799425529,1.0,1.0,0.0,0.0,0.5333333333333333,0.7,0.7628384687208216,0.85,0.6207416839631167,0.7314192343604108,0.3510912882717199
openhermes-2.5:7:ggufv2:Q5_K_M,0.125,0.9133333333333332,,,0.5799163100443383,1.0,0.8888888888888888,1.0,0.0,0.5866666666666667,1.0,0.5714285714285714,0.7777777777777778,0.6766374134672342,0.7272075956225059,0.3185927867352887
gpt-3.5-turbo-0613,0.3625,0.9466666666666668,,,0.5753814654033865,1.0,0.8888888888888888,0.5,0.8333333333333334,0.5,1.0,0.2913165266106443,0.7555555555555555,0.6957856760416795,0.7256706157986175,0.23707855730412225
openhermes-2.5:7:ggufv2:Q8_0,0.125,0.88,,,0.6008286779833671,1.0,0.8888888888888888,1.0,0.0,0.4666666666666667,1.0,0.5770308123249299,0.7555555555555555,0.6630882364926735,0.7093218960241146,0.3199188844960537
openhermes-2.5:7:ggufv2:Q4_K_M,0.046875,0.8733333333333333,,,0.5972813161390413,1.0,0.8888888888888888,1.0,0.0,0.4666666666666667,1.0,0.5863678804855276,0.7555555555555555,0.655906240097183,0.7057308978263692,0.3309323511149121
gpt-4o-mini-2024-07-18,0.365625,0.9666666666666668,0.98,0.7142857142857143,0.6845534288609352,0.8333333333333334,1.0,0.0,0.66,0.5333333333333333,0.5,0.8404984423676013,0.925,0.6925612245267373,0.7034234694062258,0.2670836626973781
Binary file modified docs/images/boxplot-medical-exam-domain.png
Binary file modified docs/images/boxplot-medical-exam-language-domain.png
Binary file modified docs/images/boxplot-medical-exam-language.png
Binary file modified docs/images/boxplot-medical-exam-task.png
Binary file modified docs/images/boxplot-naive-vs-biochatter.pdf
Binary file not shown.
Binary file modified docs/images/boxplot-per-quantisation.png
Binary file modified docs/images/boxplot-tasks.png
Binary file added docs/images/boxplot-text2cypher.png
Binary file modified docs/images/dotplot-per-task.pdf
Binary file not shown.
Binary file modified docs/images/dotplot-per-task.png
Binary file modified docs/images/scatter-per-quantisation-name.pdf
Binary file not shown.
Binary file modified docs/images/scatter-per-quantisation-name.png
Binary file modified docs/images/scatter-quantisation-accuracy.pdf
Binary file not shown.
Binary file modified docs/images/scatter-quantisation-accuracy.png
Binary file modified docs/images/scatter-size-accuracy.pdf
Binary file not shown.
Binary file modified docs/images/scatter-size-accuracy.png
Binary file modified docs/images/stripplot-extraction-tasks.png
Binary file modified docs/images/stripplot-per-model.png
Binary file modified docs/images/stripplot-rag-tasks.pdf
Binary file not shown.
Binary file modified docs/images/stripplot-rag-tasks.png
128 changes: 126 additions & 2 deletions docs/scripts/hooks.py
@@ -1,3 +1,4 @@
import math
import os
import re

@@ -33,7 +34,9 @@ def on_pre_build(config, **kwargs) -> None:

    overview = create_overview_table(result_files_path, result_file_names)

    plot_exam_en_vs_de()
    plot_text2cypher()
    plot_image_caption_confidence()
    plot_medical_exam()
    plot_accuracy_per_model(overview)
    plot_accuracy_per_quantisation(overview)
    plot_accuracy_per_task(overview)
@@ -45,6 +48,123 @@ def on_pre_build(config, **kwargs) -> None:
    calculate_stats(overview)


def plot_text2cypher():
    """
    Get entity_selection, relationship_selection, property_selection,
    property_exists, and end_to_end_query_generation results files, combine and
    preprocess them, and plot the accuracy for each task as a boxplot.
    """
    entity_selection = pd.read_csv("benchmark/results/entity_selection.csv")
    entity_selection["task"] = "entity_selection"
    relationship_selection = pd.read_csv(
        "benchmark/results/relationship_selection.csv"
    )
    relationship_selection["task"] = "relationship_selection"
    property_selection = pd.read_csv("benchmark/results/property_selection.csv")
    property_selection["task"] = "property_selection"
    property_exists = pd.read_csv("benchmark/results/property_exists.csv")
    property_exists["task"] = "property_exists"
    end_to_end_query_generation = pd.read_csv(
        "benchmark/results/end_to_end_query_generation.csv"
    )
    end_to_end_query_generation["task"] = "end_to_end_query_generation"

    # combine all results
    results = pd.concat(
        [
            entity_selection,
            relationship_selection,
            property_selection,
            property_exists,
            end_to_end_query_generation,
        ]
    )

    # calculate accuracy: score strings are "achieved/possible", with
    # semicolon-separated values for repeated iterations
    results["score_possible"] = results["score"].apply(
        lambda x: float(x.split("/")[1])
    )
    results["scores"] = results["score"].apply(lambda x: x.split("/")[0])
    results["score_achieved"] = results["scores"].apply(
        lambda x: (
            np.mean([float(score) for score in x.split(";")])
            if ";" in x
            else float(x)
        )
    )
    results["accuracy"] = results["score_achieved"] / results["score_possible"]
    results["score_sd"] = results["scores"].apply(
        lambda x: (
            np.std([float(score) for score in x.split(";")], ddof=1)
            if ";" in x
            else 0
        )
    )

    # plot results per task
    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(6, 4))
    sns.boxplot(
        x="task",
        y="accuracy",
        data=results,
    )
    plt.savefig(
        "docs/images/boxplot-text2cypher.png",
        bbox_inches="tight",
        dpi=300,
    )


def plot_image_caption_confidence():
    """
    Get the multimodal_answer_confidence.csv file, preprocess it, and plot the
    confidence scores for correct and incorrect answers as histograms. Correct
    answer confidence values are in the correct_confidence column and incorrect
    answer confidence values are in the incorrect_confidence column; both
    columns contain individual confidence values (integers between 1 and 10)
    separated by semicolons.
    """
    results = pd.read_csv("benchmark/results/multimodal_answer_confidence.csv")
    correct_values = results["correct_confidence"].to_list()
    incorrect_values = results["incorrect_confidence"].to_list()
    # flatten lists of confidence values
    correct_values = [
        int(value) for sublist in correct_values for value in sublist.split(";")
    ]
    for value in list(incorrect_values):
        # floats are either NaN (no incorrect answers) or a single value;
        # check the type first, since math.isnan() rejects strings
        if isinstance(value, float):
            if math.isnan(value):
                incorrect_values.remove(value)
            continue
        # strings may hold several semicolon-separated confidence values
        if ";" in value:
            incorrect_values.remove(value)
            incorrect_values.extend([int(val) for val in value.split(";")])

    incorrect_values = [int(value) for value in incorrect_values]

    # plot histograms of both correct and incorrect confidence values,
    # correct in green, incorrect in red
    plt.figure(figsize=(6, 4))
    plt.hist(
        [correct_values, incorrect_values],
        bins=range(1, 12),
        color=["green", "red"],
        label=["Correct", "Incorrect"],
    )
    plt.xlabel("Confidence")
    plt.ylabel("Count")
    plt.xticks(range(1, 11))
    plt.legend()
    plt.savefig(
        "docs/images/histogram-image-caption-confidence.png",
        bbox_inches="tight",
        dpi=300,
    )


def preprocess_results_for_frontend(
    raw_results: pd.DataFrame, path: str, file_name: str
) -> None:
@@ -757,13 +877,14 @@ def plot_extraction_tasks():
    )


def plot_exam_en_vs_de():
def plot_medical_exam():
    """
    Load raw result for medical_exam; aggregate based on the language and
    calculate mean accuracy for each model. Plot a stripplot of the mean
    accuracy across models, coloured by language.
    """
    medical_exam = pd.read_csv("benchmark/results/medical_exam.csv")

    medical_exam["score_possible"] = medical_exam["score"].apply(
        lambda x: float(x.split("/")[1])
    )
@@ -797,6 +918,9 @@ def plot_exam_en_vs_de():
        lambda x: x.split(":")[2]
    )

    # processing: remove "short_words" task, not informative
    medical_exam = medical_exam[medical_exam["task"] != "short_words"]

    # plot language comparison
    aggregated_scores_language = medical_exam.groupby(
        ["model_name", "language"]
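Both new plotting functions, and the medical-exam processing further up in hooks.py, rely on the benchmark's score strings of the form "achieved_1;achieved_2;achieved_3/possible". A small standalone illustration of that parsing, with a hypothetical input value and the same logic as the functions above:

# Standalone illustration of the "a;b;c/possible" score parsing used in hooks.py
# (hypothetical example string).
import numpy as np

score = "2;3;3/3"                       # three iterations, 3 points possible
possible = float(score.split("/")[1])
achieved = score.split("/")[0]
mean_achieved = (
    np.mean([float(s) for s in achieved.split(";")])
    if ";" in achieved
    else float(achieved)
)
score_sd = (
    np.std([float(s) for s in achieved.split(";")], ddof=1)
    if ";" in achieved
    else 0.0
)
print(mean_achieved / possible)         # accuracy, ~0.889
print(score_sd)                         # ~0.577, the SD pattern seen in the result CSVs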
