From 98b56346e6e016220469a6e54a681f872f797435 Mon Sep 17 00:00:00 2001
From: debrupf2946
Date: Mon, 19 Aug 2024 12:26:22 +0530
Subject: [PATCH] Add evaluation scripts using Ragas

---
 .../QA_graphrag_testdataset.py           | 130 +++++++++++
 .../ragas_evaluation/evaluation_ragas.py | 210 +++++++++---------
 2 files changed, 229 insertions(+), 111 deletions(-)
 create mode 100644 graph_rag/evaluation/ragas_evaluation/QA_graphrag_testdataset.py

diff --git a/graph_rag/evaluation/ragas_evaluation/QA_graphrag_testdataset.py b/graph_rag/evaluation/ragas_evaluation/QA_graphrag_testdataset.py
new file mode 100644
index 0000000..f162e5c
--- /dev/null
+++ b/graph_rag/evaluation/ragas_evaluation/QA_graphrag_testdataset.py
@@ -0,0 +1,130 @@
+"""
+This script contains functions to generate question-answer pairs from input documents using a language model,
+and to critique them on criteria such as groundedness, relevance, and standalone quality.
+
+Functions:
+- get_response: Sends a request to a language model API to generate a response for a given prompt.
+- qa_generator: Generates a specified number of question-answer pairs from input documents.
+- critique_qa: Critiques the generated QA pairs on groundedness, relevance, and standalone quality.
+"""
+
+from prompts import *
+import pandas as pd
+import random
+from tqdm.auto import tqdm
+import requests
+
+
+def get_response(
+    prompt: str, url: str = "http://localhost:11434/api/generate", model: str = "llama3"
+):
+    """
+    Sends a prompt to the Ollama API and retrieves the generated response.
+
+    Args:
+        prompt: The text input that the model will use to generate a response.
+        url: The API endpoint for the model (default: "http://localhost:11434/api/generate").
+        model: The model to be used for generation (default: "llama3").
+
+    Returns:
+        The generated response from the language model as a string.
+    """
+
+    payload = {"model": model, "prompt": prompt, "stream": False}
+    response = requests.post(url, json=payload)
+    resp = response.json()
+    return resp["response"]
+
+
+def qa_generator(
+    documents: list,
+    N_GENERATIONS: int = 20,
+):
+    """
+    Generates a specified number of question-answer pairs from the provided documents.
+
+    Args:
+        documents: A collection of document objects to generate QA pairs from.
+        N_GENERATIONS: The number of question-answer pairs to generate (default: 20).
+
+    Returns:
+        A list of dictionaries, each containing the generated context, question, answer, and source document metadata.
+    """
+    print(f"Generating {N_GENERATIONS} QA couples...")
+
+    outputs = []
+    for sampled_context in tqdm(random.sample(documents, N_GENERATIONS)):
+        # Generate a QA couple from the sampled context
+        output_QA_couple = get_response(
+            QA_generation_prompt.format(context=sampled_context.text)
+        )
+        try:
+            question = output_QA_couple.split("Factoid question: ")[-1].split(
+                "Answer: "
+            )[0]
+            answer = output_QA_couple.split("Answer: ")[-1]
+            assert len(answer) < 300, "Answer is too long"
+            outputs.append(
+                {
+                    "context": sampled_context.text,
+                    "question": question,
+                    "answer": answer,
+                    "source_doc": sampled_context.metadata,
+                }
+            )
+        except Exception:
+            continue
+    df = pd.DataFrame(outputs)
+    df.to_csv("QA.csv")
+    return outputs
+
+
+def critique_qa(
+    outputs: list,
+):
+    """
+    Critiques the generated question-answer pairs on groundedness, relevance, and standalone quality.
+
+    Args:
+        outputs: A list of dictionaries containing generated QA pairs to be critiqued.
+
+    Returns:
+        The critiqued QA pairs with additional fields for groundedness, relevance, and standalone quality scores and evaluations.
+    """
+    print("Generating critique for each QA couple...")
+    for output in tqdm(outputs):
+        evaluations = {
+            "groundedness": get_response(
+                question_groundedness_critique_prompt.format(
+                    context=output["context"], question=output["question"]
+                ),
+            ),
+            "relevance": get_response(
+                question_relevance_critique_prompt.format(question=output["question"]),
+            ),
+            "standalone": get_response(
+                question_standalone_critique_prompt.format(question=output["question"]),
+            ),
+        }
+        try:
+            for criterion, evaluation in evaluations.items():
+                score, eval_text = (
+                    int(evaluation.split("Total rating: ")[-1].strip()),
+                    evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
+                )
+                output.update(
+                    {
+                        f"{criterion}_score": score,
+                        f"{criterion}_eval": eval_text,
+                    }
+                )
+        except Exception:
+            continue
+    generated_questions = pd.DataFrame.from_dict(outputs)
+    generated_questions = generated_questions.loc[
+        (generated_questions["groundedness_score"] >= 4)
+        & (generated_questions["relevance_score"] >= 4)
+        & (generated_questions["standalone_score"] >= 4)
+    ]
+    generated_questions.to_csv("generated_questions.csv")
+    return outputs
diff --git a/graph_rag/evaluation/ragas_evaluation/evaluation_ragas.py b/graph_rag/evaluation/ragas_evaluation/evaluation_ragas.py
index f162e5c..99a218c 100644
--- a/graph_rag/evaluation/ragas_evaluation/evaluation_ragas.py
+++ b/graph_rag/evaluation/ragas_evaluation/evaluation_ragas.py
@@ -1,130 +1,118 @@
 """
-This script contains functions to generate question-answer pairs from input documents using a language model,
-and critique them based on various criteria like groundedness, relevance, and standalone quality.
+This script loads a pre-processed dataset, slices it into batches, and runs a series of metrics to evaluate the
+performance of a query engine using a language model and embeddings.

 Functions:
-- get_response: Sends a request to a language model API to generate responses based on a provided prompt.
-- qa_generator: Generates a specified number of question-answer pairs from input documents.
-- critique_qa: Critiques the generated QA pairs based on groundedness, relevance, and standalone quality.
+- load_test_dataset: Loads a test dataset from a pickle file.
+- slice_data: Slices the dataset into batches for evaluation.
+- evaluate: Runs evaluation on the sliced dataset using the specified metrics, LLM, and embeddings.
+
 """

-from prompts import *
+import pickle
 import pandas as pd
-import random
-from tqdm.auto import tqdm
-import requests
-
-
-def get_response(
-    prompt: str, url: str = "http://localhost:11434/api/generate", model: str = "llama3"
+from datasets import Dataset
+from ragas.integrations.llama_index import evaluate as ragas_evaluate
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from ragas.metrics.critique import harmfulness
+from llama_index.llms.ollama import Ollama
+from ragas.metrics import (
+    faithfulness,
+    answer_relevancy,
+    context_precision,
+    context_recall,
+)
+
+
+def load_test_dataset(
+    data: str,
 ):
     """
-    Sends a prompt ollama API and retrieves the generated response.
-
-    Args:
-        prompt:The text input that the model will use to generate a response.
-        url: The API endpoint for the model (default: "http://localhost:11434/api/generate").
-        model: The model to be used for generation (default: "llama3").
+    Loads a test dataset from a pickle file.

-    Returns:
-        The generated response from the language model as a string.
-    """
+    Args:
+        data: The path to the dataset file in pickle format.
-
-    payload = {"model": model, "prompt": prompt, "stream": False}
-    response = requests.post(url, json=payload)
-    resp = response.json()
-    return resp["response"]
+
+    Returns:
+        The loaded dataset, or an empty dictionary if loading fails with an EOFError.
+    """
+    try:
+        with open(data, "rb") as f:
+            dataset = pickle.load(f)
+    except EOFError:
+        print("EOFError: the file may be corrupted or incomplete; returning an empty dictionary.")
+        dataset = {}
+    return dataset


-def qa_generator(
-    documents: object,
-    N_GENERATIONS: int = 20,
+def slice_data(i: int, k: int, dataset: list):
+    """
+    Slices the dataset into smaller chunks for batch processing.
+
+    Args:
+        i: The starting index for the slice.
+        k: The size of the slice (number of records to include in each batch).
+        dataset: The list of records to be sliced.
+
+    Returns:
+        A dictionary containing the sliced dataset, with columns renamed for consistency with the evaluation process.
+    """
+
+    hf_dataset = Dataset.from_list(dataset[i : i + k])
+    hf_dataset = hf_dataset.rename_column("context", "contexts")
+    hf_dataset = hf_dataset.rename_column("answer", "ground_truth")
+    ds_dict = hf_dataset.to_dict()
+    return ds_dict
+
+
+def evaluate(
+    query_engine: object,
+    dataset: list,
+    batch: int = 4,
+    metrics: list = [
+        faithfulness,
+        answer_relevancy,
+        context_precision,
+        context_recall,
+    ],
+    llm: object = Ollama(base_url="http://localhost:11434", model="codellama"),
+    embeddings=HuggingFaceEmbedding(model_name="microsoft/codebert-base"),
 ):
     """
-    Generates a specified number of question-answer pairs from the provided documents.
+    Evaluates the performance of a query engine on a dataset using the given metrics and language model.

-    Args:
-        documents: A collection of document objects to generate QA pairs from.
-        N_GENERATIONS: The number of question-answer pairs to generate (default: 20).
+    Args:
+        query_engine: The query engine to be evaluated.
+        dataset: The dataset (a list of records) to evaluate against.
+        batch: The number of records to process in each batch (default: 4).
+        metrics: A list of metrics to be used for evaluation (default: faithfulness, answer relevancy, context precision, and context recall).
+        llm: The language model to be used for evaluation (default: Ollama with model "codellama").
+        embeddings: The embedding model to be used (default: HuggingFaceEmbedding with "microsoft/codebert-base").

-    Returns:
-        A list of dictionaries, each containing the generated context, question, answer, and source document metadata.
-    """
-    print(f"Generating {N_GENERATIONS} QA couples...")
+    Returns:
+        A pandas DataFrame containing the evaluation results for each batch.
+    """
-    outputs = []
-    for sampled_context in tqdm(random.sample(documents, N_GENERATIONS)):
-        # Generate QA couple
-        output_QA_couple = get_response(
-            QA_generation_prompt.format(context=sampled_context.text)
-        )
-        try:
-            question = output_QA_couple.split("Factoid question: ")[-1].split(
-                "Answer: "
-            )[0]
-            answer = output_QA_couple.split("Answer: ")[-1]
-            assert len(answer) < 300, "Answer is too long"
-            outputs.append(
-                {
-                    "context": sampled_context.text,
-                    "question": question,
-                    "answer": answer,
-                    "source_doc": sampled_context.metadata,
-                }
-            )
-        except:
-            continue
-    df = pd.DataFrame(outputs)
-    df.to_csv("QA.csv")
-    return outputs
-
-
-def critique_qa(
-    outputs: list,
-):
-    """
-    Critiques the generated question-answer pairs based on groundedness, relevance, and standalone quality.
+
+    rows_count = len(dataset)
-
-    Args:
-        outputs: A list of dictionaries containing generated QA pairs to be critiqued.
+
+    results_df = pd.DataFrame()
-
-    Returns:
-        The critiqued QA pairs with additional fields for groundedness, relevance, and standalone quality scores and evaluations.
-    """
-    print("Generating critique for each QA couple...")
-    for output in tqdm(outputs):
-        evaluations = {
-            "groundedness": get_response(
-                question_groundedness_critique_prompt.format(
-                    context=output["context"], question=output["question"]
-                ),
-            ),
-            "relevance": get_response(
-                question_relevance_critique_prompt.format(question=output["question"]),
-            ),
-            "standalone": get_response(
-                question_standalone_critique_prompt.format(question=output["question"]),
-            ),
-        }
-        try:
-            for criterion, evaluation in evaluations.items():
-                score, eval = (
-                    int(evaluation.split("Total rating: ")[-1].strip()),
-                    evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
-                )
-                output.update(
-                    {
-                        f"{criterion}_score": score,
-                        f"{criterion}_eval": eval,
-                    }
-                )
-        except Exception as e:
-            continue
-    generated_questions = pd.DataFrame.from_dict(outputs)
-    generated_questions = generated_questions.loc[
-        (generated_questions["groundedness_score"] >= 4)
-        & (generated_questions["relevance_score"] >= 4)
-        & (generated_questions["standalone_score"] >= 4)
-    ]
-    generated_questions.to_csv("generated_questions.csv")
-    return outputs
+
+    for i in range(0, rows_count, batch):
+
+        batch_data = slice_data(i, batch, dataset=dataset)
+
+        result = ragas_evaluate(
+            query_engine=query_engine,
+            metrics=metrics,
+            dataset=batch_data,
+            llm=llm,
+            embeddings=embeddings,
+        )
+
+        rdf = result.to_pandas()
+        results_df = pd.concat([results_df, rdf], ignore_index=True)
+        print(f"Processed batch {i // batch + 1}:")
+        print(rdf)
+
+    print(results_df)
+    results_df.to_csv("results.csv", index=False)
+    return results_df
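
Usage sketch (reviewer note, not part of the patch): a minimal example of how the two scripts above could be chained end to end. It assumes `prompts.py` is importable, an Ollama server is running locally, and a `documents` list (objects with `.text` and `.metadata`) plus a `query_engine` have already been built elsewhere in the project; the pickle file name and variable names are illustrative only.

# Hypothetical end-to-end run; `documents` and `query_engine` are assumed to
# come from the existing graph_rag pipeline and are not defined here.
import pickle

from QA_graphrag_testdataset import qa_generator, critique_qa
from evaluation_ragas import load_test_dataset, evaluate

# 1. Generate and critique synthetic QA pairs from the documents.
qa_pairs = qa_generator(documents, N_GENERATIONS=20)  # also writes QA.csv
critique_qa(qa_pairs)                                 # also writes generated_questions.csv

# 2. Persist the QA pairs so the evaluation script can reload them later.
with open("test_dataset.pkl", "wb") as f:             # illustrative file name
    pickle.dump(qa_pairs, f)

# 3. Evaluate the query engine with Ragas in batches of 4 records.
dataset = load_test_dataset("test_dataset.pkl")
results_df = evaluate(query_engine=query_engine, dataset=dataset, batch=4)
print(results_df.head())                              # full results also saved to results.csv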