added graph_rag_test_data generation module using ragas
Commit 5262d42 (1 parent: 0cdf0ec)
Showing 2 changed files with 160 additions and 0 deletions.
@@ -0,0 +1,85 @@
from prompts import *
import pandas as pd
import random
from tqdm.auto import tqdm
import requests


def get_response(prompt: str,
                 url: str = "http://localhost:11434/api/generate",
                 model: str = "llama3"):
    """Send a prompt to a local Ollama-style /api/generate endpoint and return the generated text."""
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
    }
    response = requests.post(url, json=payload)
    resp = response.json()
    return resp["response"]


def qa_generator(documents: list, N_GENERATIONS: int = 20):
    """Sample N_GENERATIONS document chunks and ask the LLM for one factoid question/answer pair per chunk."""
    print(f"Generating {N_GENERATIONS} QA couples...")

    outputs = []
    for sampled_context in tqdm(random.sample(documents, N_GENERATIONS)):
        # Generate a QA couple from the sampled chunk's text.
        output_QA_couple = get_response(QA_generation_prompt.format(context=sampled_context.text))
        try:
            question = output_QA_couple.split("Factoid question: ")[-1].split("Answer: ")[0]
            answer = output_QA_couple.split("Answer: ")[-1]
            assert len(answer) < 300, "Answer is too long"
            outputs.append(
                {
                    "context": sampled_context.text,
                    "question": question,
                    "answer": answer,
                    "source_doc": sampled_context.metadata,
                }
            )
        except Exception:
            # Skip generations that do not follow the expected output format.
            continue
    df = pd.DataFrame(outputs)
    df.to_csv("QA.csv")
    return outputs


def critique_qa(outputs: list):
    """Score each QA couple on groundedness, relevance and standalone quality, keeping those rated >= 4 on all three."""
    print("Generating critique for each QA couple...")
    for output in tqdm(outputs):
        evaluations = {
            "groundedness": get_response(
                question_groundedness_critique_prompt.format(context=output["context"], question=output["question"]),
            ),
            "relevance": get_response(
                question_relevance_critique_prompt.format(question=output["question"]),
            ),
            "standalone": get_response(
                question_standalone_critique_prompt.format(question=output["question"]),
            ),
        }
        try:
            for criterion, evaluation in evaluations.items():
                # Each critique is expected to end with "Evaluation: ... Total rating: <1-5>".
                score, eval_text = (
                    int(evaluation.split("Total rating: ")[-1].strip()),
                    evaluation.split("Total rating: ")[-2].split("Evaluation: ")[1],
                )
                output.update(
                    {
                        f"{criterion}_score": score,
                        f"{criterion}_eval": eval_text,
                    }
                )
        except Exception:
            # Skip QA couples whose critiques cannot be parsed.
            continue
    generated_questions = pd.DataFrame.from_dict(outputs)
    generated_questions = generated_questions.loc[
        (generated_questions["groundedness_score"] >= 4)
        & (generated_questions["relevance_score"] >= 4)
        & (generated_questions["standalone_score"] >= 4)
    ]
    generated_questions.to_csv("generated_questions.csv")
    return outputs
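The module above assumes `documents` is a sequence of chunk objects exposing `.text` and `.metadata` (for example LlamaIndex- or LangChain-style nodes) and that an Ollama server is listening on localhost:11434. A minimal, hypothetical driver — the `Document` wrapper, the import path and the sample texts are assumptions for illustration, not part of the commit — might look like this:

from dataclasses import dataclass

# Hypothetical import path; the commit does not show the module's file name.
from graph_rag_test_data import qa_generator, critique_qa


@dataclass
class Document:
    # Minimal stand-in for a chunk object with the attributes the module reads.
    text: str
    metadata: dict


docs = [
    Document(text="Paris is the capital of France.", metadata={"source": "geo.txt"}),
    Document(text="The Eiffel Tower was completed in 1889.", metadata={"source": "geo.txt"}),
    Document(text="The Seine flows through Paris.", metadata={"source": "geo.txt"}),
]

# Generate raw QA couples (also written to QA.csv as a side effect).
qa_pairs = qa_generator(docs, N_GENERATIONS=3)

# Score each couple and write the filtered set to generated_questions.csv.
critique_qa(qa_pairs)

Note that qa_generator needs len(documents) >= N_GENERATIONS, since random.sample draws without replacement. The prompt templates pulled in by `from prompts import *` (QA_generation_prompt and the three critique prompts) are defined in the second changed file, below.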
@@ -0,0 +1,75 @@
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".
Provide your answer as follows:
Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)
Now here is the context.
Context: {context}\n
Output:::"""


question_groundedness_critique_prompt = """
You will be given a context and a question.
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.
Provide your answer as follows:
Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)
You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
Now here are the question and context.
Question: {question}\n
Context: {context}\n
Answer::: """


question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to machine learning developers building NLP applications with the Hugging Face ecosystem.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.
Provide your answer as follows:
Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)
You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
Now here is the question.
Question: {question}\n
Answer::: """


question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independent this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.
For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independent of the context.
Provide your answer as follows:
Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)
You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
Now here is the question.
Question: {question}\n
Answer::: """
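Each critique prompt asks the model to reply in the fixed "Evaluation: ... Total rating: ..." layout, which is exactly what the string splitting in critique_qa relies on. A small sketch of that round-trip follows; the model reply shown is invented purely for illustration and is not output from the commit's code:

# An invented reply in the format the critique prompts request.
fake_reply = (
    "Answer:::\n"
    "Evaluation: The context states the answer explicitly, so the question is fully answerable.\n"
    "Total rating: 5"
)

# The same parsing used in critique_qa: the text after the last "Total rating: "
# is the score, and the segment between "Evaluation: " and "Total rating: " is the rationale.
score = int(fake_reply.split("Total rating: ")[-1].strip())
rationale = fake_reply.split("Total rating: ")[-2].split("Evaluation: ")[1]

print(score)      # 5
print(rationale)  # "The context states the answer explicitly, so the question is fully answerable."

If a reply drifts from this format (for example, it omits "Total rating:" or puts text after the number), the int() conversion or the index lookup raises, and that QA couple is silently skipped — which is why the parsing loop in critique_qa is wrapped in try/except.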