Merge pull request #102 from biocypher/benchmark
Benchmark & RAG agent, architecture changes (potentially breaking → minor version increase)
Showing 64 changed files with 6,844 additions and 3,413 deletions.
Two files were deleted in this commit (their contents are not shown).
@@ -11,4 +11,5 @@ __pycache__/
.idea/
*.env
volumes/
benchmark/results/*.csv
benchmark/encrypted_llm_test_data.json
site/
@@ -0,0 +1,50 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
fail_fast: false
default_language_version:
  python: python3
default_stages:
  - commit
  - push
minimum_pre_commit_version: 2.7.1
repos:
  - repo: https://github.com/ambv/black
    rev: 23.7.0
    hooks:
      - id: black
  - repo: https://github.com/timothycrosley/isort
    rev: 5.12.0
    hooks:
      - id: isort
        additional_dependencies: [toml]
  - repo: https://github.com/snok/pep585-upgrade
    rev: v1.0
    hooks:
      - id: upgrade-type-hints
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.4.0
    hooks:
      - id: check-docstring-first
      - id: end-of-file-fixer
      - id: check-added-large-files
      - id: mixed-line-ending
      - id: trailing-whitespace
        exclude: ^.bumpversion.cfg$
      - id: check-merge-conflict
      - id: check-case-conflict
      - id: check-symlinks
      - id: check-yaml
        args: [--unsafe]
      - id: check-ast
      - id: fix-encoding-pragma
        args: [--remove]  # not necessary for a Python 3 codebase
      - id: requirements-txt-fixer
  - repo: https://github.com/pre-commit/pygrep-hooks
    rev: v1.10.0
    hooks:
      - id: python-no-eval
      - id: python-use-type-annotations
      - id: python-check-blanket-noqa
      - id: rst-backticks
      - id: rst-directive-colons
      - id: rst-inline-touching-normal
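This configuration only takes effect once the hooks are installed in a local clone. As a minimal sketch (not part of this commit), the standard pre-commit workflow can be driven from Python as below; the `pre-commit install` and `pre-commit run --all-files` commands are the tool's documented CLI, and the subprocess wrapper is purely illustrative.

# Illustrative only: invoke the pre-commit CLI for the config above.
# Assumes pre-commit is already installed (e.g. via `pip install pre-commit`).
import subprocess

# Register the hooks with git so they run on every commit in this clone.
subprocess.run(["pre-commit", "install"], check=True)

# Run every configured hook against the whole repository, as CI would.
subprocess.run(["pre-commit", "run", "--all-files"], check=True)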
@@ -0,0 +1,105 @@
import pytest

import pandas as pd


def benchmark_already_executed(
    model_name: str,
    task: str,
    subtask: str,
) -> bool:
    """
    Checks if the benchmark task and subtask test case for the model_name have
    already been executed.

    Args:
        task (str): The benchmark task, e.g. "biocypher_query_generation"
        subtask (str): The benchmark subtask test case, e.g. "0_entities"
        model_name (str): The model name, e.g. "gpt-3.5-turbo"

    Returns:
        bool: True if the benchmark task and subtask for the model_name have
        already been run, False otherwise
    """
    task_results = return_or_create_result_file(task)
    task_results_subset = (task_results["model_name"] == model_name) & (
        task_results["subtask"] == subtask
    )
    return task_results_subset.any()


def skip_if_already_run(
    model_name: str,
    task: str,
    subtask: str,
) -> None:
    """Helper function to check if the test case has already been executed.

    Args:
        model_name (str): The model name, e.g. "gpt-3.5-turbo"
        task (str): The benchmark task, e.g. "biocypher_query_generation"
        subtask (str): The benchmark subtask test case, e.g. "0_single_word"
    """
    if benchmark_already_executed(model_name, task, subtask):
        pytest.skip(
            f"benchmark {task}: {subtask} with {model_name} already executed"
        )


def return_or_create_result_file(
    task: str,
):
    """
    Returns the result file for the task or creates it if it does not exist.

    Args:
        task (str): The benchmark task, e.g. "biocypher_query_generation"

    Returns:
        pd.DataFrame: The result file for the task
    """
    file_path = get_result_file_path(task)
    try:
        results = pd.read_csv(file_path, header=0)
    except (pd.errors.EmptyDataError, FileNotFoundError):
        results = pd.DataFrame(
            columns=["model_name", "subtask", "score", "iterations"]
        )
        results.to_csv(file_path, index=False)
    return results


def write_results_to_file(
    model_name: str, subtask: str, score: str, iterations: str, file_path: str
):
    """Writes the benchmark results for the subtask to the result file.

    Args:
        model_name (str): The model name, e.g. "gpt-3.5-turbo"
        subtask (str): The benchmark subtask test case, e.g. "entities_0"
        score (str): The benchmark score, e.g. "1/1"
        iterations (str): The number of iterations, e.g. "1"
        file_path (str): The path to the result file
    """
    results = pd.read_csv(file_path, header=0)
    new_row = pd.DataFrame(
        [[model_name, subtask, score, iterations]], columns=results.columns
    )
    results = pd.concat([results, new_row], ignore_index=True).sort_values(
        by=["model_name", "subtask"]
    )
    results.to_csv(file_path, index=False)


# TODO should we use SQLite? An online database (REDIS)?
def get_result_file_path(file_name: str) -> str:
    """Returns the path to the result file.

    Args:
        file_name (str): The name of the result file

    Returns:
        str: The path to the result file
    """
    return f"benchmark/results/{file_name}.csv"