From dbdf54b3e416a338424a734564dc5e0333475f84 Mon Sep 17 00:00:00 2001
From: Jithin James
Date: Fri, 26 Jan 2024 16:44:57 -0800
Subject: [PATCH] fix (metrics): changed ground_truths -> ground_truth (#521)

---
 docs/getstarted/evaluation.md                 |  7 ++--
 docs/howtos/customisations/aws-bedrock.ipynb  |  3 +-
 docs/howtos/customisations/azure-openai.ipynb |  6 +--
 docs/howtos/customisations/embeddings.ipynb   |  2 +-
 docs/howtos/customisations/gcp-vertexai.ipynb | 42 +++++++------------
 docs/howtos/customisations/llms.ipynb         |  4 +-
 src/ragas/llms/base.py                        |  4 +-
 src/ragas/metrics/_answer_correctness.py      |  4 +-
 src/ragas/metrics/_answer_similarity.py       | 18 ++++----
 src/ragas/metrics/_context_precision.py       |  2 +-
 src/ragas/metrics/_context_recall.py          |  3 +-
 src/ragas/testset/evolutions.py               |  8 ++--
 src/ragas/testset/generator.py                | 12 ++++--
 src/ragas/validation.py                       | 16 +++----
 tests/benchmarks/benchmark_eval.py            |  2 +-
 tests/unit/test_validation.py                 | 16 +++----
 16 files changed, 71 insertions(+), 78 deletions(-)

diff --git a/docs/getstarted/evaluation.md b/docs/getstarted/evaluation.md
index 0091d1832..df4029620 100644
--- a/docs/getstarted/evaluation.md
+++ b/docs/getstarted/evaluation.md
@@ -36,8 +36,9 @@ Ideally your list of questions should reflect the questions your users give, inc
 :caption: import sample dataset
 from datasets import load_dataset
 
-fiqa_eval = load_dataset("explodinggradients/fiqa", "ragas_eval")
-fiqa_eval
+# loading the V2 dataset
+amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")
+amnesty_qa
 ```
 
 :::{seealso}
@@ -85,7 +86,7 @@ Running the evaluation is as simple as calling evaluate on the `Dataset` with th
 from ragas import evaluate
 
 result = evaluate(
-    fiqa_eval["baseline"].select(range(3)), # selecting only 3
+    amnesty_qa["eval"],
     metrics=[
         context_precision,
         faithfulness,
diff --git a/docs/howtos/customisations/aws-bedrock.ipynb b/docs/howtos/customisations/aws-bedrock.ipynb
index 9d1daee65..341e618a5 100644
--- a/docs/howtos/customisations/aws-bedrock.ipynb
+++ b/docs/howtos/customisations/aws-bedrock.ipynb
@@ -48,7 +48,8 @@
     "# data\n",
     "from datasets import load_dataset\n",
     "\n",
-    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english\")"
+    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
+    "amnesty_qa"
    ]
   },
   {
diff --git a/docs/howtos/customisations/azure-openai.ipynb b/docs/howtos/customisations/azure-openai.ipynb
index 4a11b9bbd..c308841b2 100644
--- a/docs/howtos/customisations/azure-openai.ipynb
+++ b/docs/howtos/customisations/azure-openai.ipynb
@@ -75,8 +75,8 @@
     "# data\n",
     "from datasets import load_dataset\n",
     "\n",
-    "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
-    "fiqa_eval"
+    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
+    "amnesty_qa"
    ]
   },
   {
@@ -241,7 +241,7 @@
    ],
    "source": [
     "result = evaluate(\n",
-    "    fiqa_eval[\"baseline\"], metrics=metrics, llm=azure_model, embeddings=azure_embeddings\n",
+    "    amnesty_qa[\"eval\"], metrics=metrics, llm=azure_model, embeddings=azure_embeddings\n",
     ")\n",
     "\n",
     "result"
diff --git a/docs/howtos/customisations/embeddings.ipynb b/docs/howtos/customisations/embeddings.ipynb
index 861ae788d..f836cd369 100644
--- a/docs/howtos/customisations/embeddings.ipynb
+++ b/docs/howtos/customisations/embeddings.ipynb
@@ -92,7 +92,7 @@
     }
    ],
    "source": [
-    "#dataset\n",
+    "# dataset\n",
     "from datasets import load_dataset\n",
     "\n",
     "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english\")\n",
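For orientation, the documentation hunks above all converge on the same quickstart flow; the sketch below restates it in one place. It assumes `ragas` and `datasets` are installed and an OpenAI key is configured, and it only uses calls that appear in the diff.

```python
# Minimal sketch of the post-patch quickstart shown in the doc hunks above.
# Assumes `ragas` and `datasets` are installed and OPENAI_API_KEY is set.
from datasets import load_dataset

from ragas import evaluate
from ragas.metrics import context_precision, faithfulness

# english_v2 ships a single string "ground_truth" column, replacing the
# older list-valued "ground_truths"
amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")

result = evaluate(
    amnesty_qa["eval"],
    metrics=[context_precision, faithfulness],
)
print(result)
```

The same `amnesty_qa["eval"]` split is what the Azure, Bedrock, and Vertex AI notebooks below now pass to `evaluate`.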
diff --git a/docs/howtos/customisations/gcp-vertexai.ipynb b/docs/howtos/customisations/gcp-vertexai.ipynb
index 9623b84d3..70722c34a 100644
--- a/docs/howtos/customisations/gcp-vertexai.ipynb
+++ b/docs/howtos/customisations/gcp-vertexai.ipynb
@@ -34,7 +34,9 @@
    "cell_type": "code",
    "execution_count": 1,
    "id": "0d3e6c99-c19c-44a1-8f05-4bde2de30866",
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 0
+   },
    "outputs": [
     {
      "name": "stderr",
@@ -77,8 +79,8 @@
     "# data\n",
     "from datasets import load_dataset\n",
     "\n",
-    "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
-    "fiqa_eval"
+    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
+    "amnesty_qa"
    ]
   },
   {
@@ -86,16 +88,11 @@
    "id": "4e67daaa-60e3-4584-8ec6-944c3c5a1a0c",
    "metadata": {},
    "source": [
-    "Now lets import the metrics we are going to use"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "42081210-3c0d-4e27-974a-ef152364a4ab",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "```\n",
+    "\n",
+    "Now lets import the metrics we are going to use\n",
+    "\n",
+    "```python\n",
     "from ragas.metrics import (\n",
     "    context_precision,\n",
     "    answer_relevancy, # AnswerRelevancy\n",
@@ -111,14 +108,9 @@
     "    context_recall,\n",
     "    context_precision,\n",
     "    harmfulness,\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "90fa19c3-1356-412f-a39d-f9907c69a80e",
-   "metadata": {},
-   "source": [
+    "]\n",
+    "```\n",
+    "\n",
     "By default Ragas uses `ChatOpenAI` for evaluations, lets swap that out with `ChatVertextAI`. We also need to change the embeddings used for evaluations for `OpenAIEmbeddings` to `VertextAIEmbeddings` for metrices that need it, which in our case is `answer_relevancy`.\n",
     "\n",
     "Now in order to use the new `ChatVertextAI` llm instance with Ragas metrics, you have to create a new instance of `RagasLLM` using the `ragas.llms.LangchainLLM` wrapper. Its a simple wrapper around langchain that make Langchain LLM/Chat instances compatible with how Ragas metrics will use them."
@@ -275,15 +267,9 @@
    ],
    "source": [
     "from ragas import evaluate\n",
-    "import nest_asyncio # CHECK NOTES\n",
-    "\n",
-    "# NOTES: Only used when running on a jupyter notebook, otherwise comment or remove this function.\n",
-    "nest_asyncio.apply()\n",
     "\n",
     "result = evaluate(\n",
-    "    fiqa_eval[\"baseline\"].select(\n",
-    "        range(1)\n",
-    "    ), # using 1 as example due to quota constrains\n",
+    "    amnesty_qa[\"eval\"].select(range(1)), # using 1 as example due to quota constraints\n",
     "    metrics=metrics,\n",
     ")\n",
     "\n",
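The notebook text above describes wrapping a Vertex AI chat model with `ragas.llms.LangchainLLM` before handing it to the metrics. The sketch below illustrates that pattern under the stated assumption that `LangchainLLM` and langchain's `ChatVertexAI` are importable under these names; both have moved in newer releases, so treat it as illustrative rather than a pinned API.

```python
# Illustrative only: wraps a Vertex AI chat model for use by ragas metrics,
# following the ragas.llms.LangchainLLM pattern described in the notebook
# text above. Class names/paths may differ across ragas and langchain versions.
from langchain.chat_models import ChatVertexAI

from ragas.llms import LangchainLLM
from ragas.metrics import answer_relevancy

vertex_llm = LangchainLLM(ChatVertexAI())  # requires GCP credentials

# point the metric at the wrapped model instead of the default ChatOpenAI
answer_relevancy.llm = vertex_llm
```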
diff --git a/docs/howtos/customisations/llms.ipynb b/docs/howtos/customisations/llms.ipynb
index 27462c077..d1f34504e 100644
--- a/docs/howtos/customisations/llms.ipynb
+++ b/docs/howtos/customisations/llms.ipynb
@@ -105,7 +105,7 @@
     "# data\n",
     "from datasets import load_dataset\n",
     "\n",
-    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english\")\n",
+    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
     "amnesty_qa"
    ]
   },
@@ -210,7 +210,7 @@
     "    openai_api_base=inference_server_url,\n",
     "    max_tokens=5,\n",
     "    temperature=0,\n",
-    ")\n"
+    ")"
    ]
   },
   {
diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py
index bc8210f11..da43b7776 100644
--- a/src/ragas/llms/base.py
+++ b/src/ragas/llms/base.py
@@ -10,9 +10,9 @@
 from langchain_core.outputs import LLMResult
 from tenacity import (
     retry,
-    stop_after_attempt,
+    stop_after_attempt,  # for exponential backoff
     wait_random_exponential,
-)  # for exponential backoff
+)
 
 if t.TYPE_CHECKING:
     from langchain_core.callbacks import Callbacks
diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index da7f95736..3aca756d5 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -141,7 +141,7 @@ def _compute_statement_presence(self, prediction: t.Any) -> float:
     def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "LLM must be set"
 
-        q, a, g = row["question"], row["answer"], row["ground_truths"][0]
+        q, a, g = row["question"], row["answer"], row["ground_truth"]
         p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a)
         is_statement_present = self.llm.generate_text(p_value, callbacks=callbacks)
 
@@ -165,7 +165,7 @@ def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "LLM must be set"
 
-        q, a, g = row["question"], row["answer"], row["ground_truths"][0]
+        q, a, g = row["question"], row["answer"], row["ground_truth"]
         p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a)
         is_statement_present = await self.llm.agenerate_text(
             p_value, callbacks=callbacks
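The two `_answer_correctness.py` hunks above swap `row["ground_truths"][0]` for `row["ground_truth"]`. The toy rows below illustrate that contract change; they are not ragas code, just the keys the metric reads.

```python
# Toy illustration of the row shape change behind this patch (not ragas code).
old_row = {
    "question": "Who published the report?",
    "answer": "Amnesty International published the report.",
    "ground_truths": ["Amnesty International published the report."],  # list of strings
}

new_row = {
    "question": "Who published the report?",
    "answer": "Amnesty International published the report.",
    "ground_truth": "Amnesty International published the report.",  # plain string
}

# what _score()/_ascore() now read, per the hunks above
q, a, g = new_row["question"], new_row["answer"], new_row["ground_truth"]
assert g == old_row["ground_truths"][0]
```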
diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py
index f6a3669fb..c389a8919 100644
--- a/src/ragas/metrics/_answer_similarity.py
+++ b/src/ragas/metrics/_answer_similarity.py
@@ -59,15 +59,15 @@ def init_model(self):
     def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.embeddings is not None, "embeddings must be set"
 
-        ground_truths, answers = row["ground_truths"], row["answer"]
-        ground_truths = [item[0] for item in ground_truths]
+        ground_truth, answers = row["ground_truth"], row["answer"]
+        ground_truth = [item[0] for item in ground_truth]
 
         if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings):
             raise NotImplementedError(
                 "async score [ascore()] not implemented for HuggingFace embeddings"
             )
         else:
-            embeddings_1 = np.array(self.embeddings.embed_documents(ground_truths))
+            embeddings_1 = np.array(self.embeddings.embed_documents(ground_truth))
             embeddings_2 = np.array(self.embeddings.embed_documents(answers))
             similarity = embeddings_1 @ embeddings_2.T
             if similarity.size == 1:
@@ -79,15 +79,15 @@ def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert isinstance(scores, np.ndarray), "Expects ndarray"
         if self.threshold:
-            scores = scores >= self.threshold  # type: ignore
+            scores = scores >= self.threshold
 
         return scores.tolist()[0]
 
     async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float:
         assert self.embeddings is not None, "embeddings must be set"
 
-        ground_truths, answers = row["ground_truths"], row["answer"]
-        ground_truths = [item[0] for item in ground_truths]
+        ground_truth: t.List[str] = row["ground_truth"]
+        answer: t.List[str] = row["answer"]
 
         if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings):
             raise NotImplementedError(
@@ -95,9 +95,9 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float
             )
         else:
             embeddings_1 = np.array(
-                await self.embeddings.aembed_documents(ground_truths)
+                await self.embeddings.aembed_documents(ground_truth)
             )
-            embeddings_2 = np.array(await self.embeddings.aembed_documents(answers))
+            embeddings_2 = np.array(await self.embeddings.aembed_documents(answer))
             similarity = embeddings_1 @ embeddings_2.T
             if similarity.size == 1:
                 scores = similarity.flatten()
@@ -106,7 +106,7 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float
         assert isinstance(scores, np.ndarray), "Expects ndarray"
         if self.threshold:
-            scores = scores >= self.threshold  # type: ignore
+            scores = scores >= self.threshold
 
         return scores.tolist()[0]
 
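The `_answer_similarity.py` hunks above keep the same embedding matrix product while renaming the column. The standalone numpy sketch below mirrors that computation; `fake_embed` is a hypothetical stand-in for the real embeddings backend.

```python
# Standalone sketch of the similarity math in _answer_similarity.py above;
# fake_embed is a hypothetical stand-in for the real embeddings backend.
import numpy as np


def fake_embed(texts: list) -> np.ndarray:
    """Return one deterministic toy embedding row per input text."""
    rng = np.random.default_rng(42)
    return rng.normal(size=(len(texts), 8))


ground_truth = ["Amnesty International published the report."]
answer = ["The report was published by Amnesty International."]

embeddings_1 = fake_embed(ground_truth)
embeddings_2 = fake_embed(answer)
similarity = embeddings_1 @ embeddings_2.T  # same matrix product as in _score()/_ascore()
score = float(similarity.flatten()[0])
print(score)
```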
diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py
index 22dcd87b9..a5b723066 100644
--- a/src/ragas/metrics/_context_precision.py
+++ b/src/ragas/metrics/_context_precision.py
@@ -73,7 +73,7 @@ class ContextPrecision(MetricWithLLM):
     batch_size: int = 15
 
     def _get_row_attributes(self, row: t.Dict) -> t.Tuple[str, t.List[str], t.Any]:
-        answer = "ground_truths"
+        answer = "ground_truth"
         if answer not in row.keys():
             logger.warning(
                 "Using 'context_precision' without ground truth will be soon depreciated. Use 'context_utilization' instead"
diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py
index 0c7e15e65..b1185b9e4 100644
--- a/src/ragas/metrics/_context_recall.py
+++ b/src/ragas/metrics/_context_recall.py
@@ -87,8 +87,7 @@ class ContextRecall(MetricWithLLM):
     batch_size: int = 15
 
     def _create_context_recall_prompt(self, row: t.Dict) -> PromptValue:
-        qstn, ctx, gt = row["question"], row["contexts"], row["ground_truths"]
-        gt = "\n".join(gt) if isinstance(gt, list) else gt
+        qstn, ctx, gt = row["question"], row["contexts"], row["ground_truth"]
         ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx
 
         return self.context_recall_prompt.format(question=qstn, context=ctx, answer=gt)
diff --git a/src/ragas/testset/evolutions.py b/src/ragas/testset/evolutions.py
index 0f9002f89..255a1b93a 100644
--- a/src/ragas/testset/evolutions.py
+++ b/src/ragas/testset/evolutions.py
@@ -37,8 +37,8 @@ class CurrentNodes:
 
 class DataRow(BaseModel):
     question: str
-    context: str
-    answer: str
+    contexts: t.List[str]
+    ground_truth: str
     evolution_type: str
 
 
@@ -149,8 +149,8 @@ def generate_datarow(
 
         return DataRow(
             question=question,
-            context=merged_nodes.page_content,
-            answer="" if answer is None else answer,
+            contexts=[n.page_content for n in current_nodes.nodes],
+            ground_truth="" if answer is None else answer,
            evolution_type=evolution_type,
         )
 
diff --git a/src/ragas/testset/generator.py b/src/ragas/testset/generator.py
index f0b292268..425d5f349 100644
--- a/src/ragas/testset/generator.py
+++ b/src/ragas/testset/generator.py
@@ -5,6 +5,7 @@
 from dataclasses import dataclass
 
 import pandas as pd
+from datasets import Dataset
 from langchain_openai.chat_models import ChatOpenAI
 from langchain_openai.embeddings import OpenAIEmbeddings
 
@@ -24,8 +25,8 @@
 from ragas.testset.filters import EvolutionFilter, NodeFilter, QuestionFilter
 
 if t.TYPE_CHECKING:
-    from llama_index.readers.schema import Document as LlamaindexDocument
     from langchain_core.documents import Document as LCDocument
+    from llama_index.readers.schema import Document as LlamaindexDocument
 
 logger = logging.getLogger(__name__)
 
@@ -41,14 +42,19 @@ class TestDataset:
 
     test_data: t.List[DataRow]
 
-    def to_pandas(self) -> pd.DataFrame:
+    def _to_records(self) -> t.List[t.Dict]:
         data_samples = []
         for data in self.test_data:
             data_dict = dict(data)
             data_dict["episode_done"] = True
             data_samples.append(data_dict)
+        return data_samples
+
+    def to_pandas(self) -> pd.DataFrame:
+        return pd.DataFrame.from_records(self._to_records())
 
-        return pd.DataFrame.from_records(data_samples)
+    def to_dataset(self) -> Dataset:
+        return Dataset.from_list(self._to_records())
 
 
 @dataclass
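The `evolutions.py` and `generator.py` hunks above change the generated-row schema (`contexts`/`ground_truth`) and add a `to_dataset()` export. The sketch below shows how those pieces fit together; field names come from the hunks, while the import paths are inferred from this patch's file layout and may be re-exported elsewhere.

```python
# Sketch of the new testset schema and export paths; field names mirror the
# DataRow/TestDataset hunks above, import paths are inferred from this patch.
from ragas.testset.evolutions import DataRow
from ragas.testset.generator import TestDataset

row = DataRow(
    question="Who published the report?",
    contexts=["Amnesty International published a report in 2022 ..."],
    ground_truth="Amnesty International published the report.",
    evolution_type="simple",
)

testset = TestDataset(test_data=[row])
df = testset.to_pandas()   # as before, now routed through _to_records()
ds = testset.to_dataset()  # new: a datasets.Dataset, with episode_done=True added
```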
diff --git a/src/ragas/validation.py b/src/ragas/validation.py
index fa1c4471d..17a6e3f4f 100644
--- a/src/ragas/validation.py
+++ b/src/ragas/validation.py
@@ -16,14 +16,14 @@ def remap_column_names(dataset: Dataset, column_map: dict[str, str]) -> Dataset:
 
 
 def validate_column_dtypes(ds: Dataset):
-    for column_names in ["question", "answer"]:
+    for column_names in ["question", "answer", "ground_truth"]:
         if column_names in ds.features:
             if ds.features[column_names].dtype != "string":
                 raise ValueError(
                     f'Dataset feature "{column_names}" should be of type string'
                 )
 
-    for column_names in ["contexts", "ground_truths"]:
+    for column_names in ["contexts"]:
         if column_names in ds.features:
             if not (
                 isinstance(ds.features[column_names], Sequence)
@@ -39,10 +39,10 @@ def validate_column_dtypes(ds: Dataset):
     EvaluationMode.qac: ["question", "answer", "contexts"],
     EvaluationMode.qa: ["question", "answer"],
     EvaluationMode.qc: ["question", "contexts"],
-    EvaluationMode.gc: ["ground_truths", "contexts"],
-    EvaluationMode.ga: ["ground_truths", "answer"],
-    EvaluationMode.qga: ["question", "ground_truths", "answer"],
-    EvaluationMode.qcg: ["question", "contexts", "ground_truths"],
+    EvaluationMode.gc: ["ground_truth", "contexts"],
+    EvaluationMode.ga: ["ground_truth", "answer"],
+    EvaluationMode.qga: ["question", "ground_truth", "answer"],
+    EvaluationMode.qcg: ["question", "contexts", "ground_truth"],
 }
 
 
@@ -64,9 +64,9 @@ def validate_evaluation_modes(ds: Dataset, metrics: list[Metric]):
             extra_msg = ""
             if (
                 isinstance(m, ContextPrecision)
-                and "ground_truths" not in available_columns
+                and "ground_truth" not in available_columns
             ):
-                extra_msg = "Looks like you're trying to use 'context_precision' without ground_truths. Please use consider using `context_utilization' instead."
+                extra_msg = "Looks like you're trying to use 'context_precision' without ground_truth. Please consider using 'context_utilization' instead."
 
             raise ValueError(
                 f"The metric [{m.name}] that that is used requires the following "
diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py
index ae91c9937..03d2348f3 100644
--- a/tests/benchmarks/benchmark_eval.py
+++ b/tests/benchmarks/benchmark_eval.py
@@ -17,7 +17,7 @@ from ragas.metrics.critique import harmfulness
 
 # data
-ds = load_dataset("explodinggradients/amnesty_qa", "english")
+ds = load_dataset("explodinggradients/amnesty_qa", "english_v2")
 assert isinstance(ds, DatasetDict)
 
 eval_dataset = ds["eval"]
diff --git a/tests/unit/test_validation.py b/tests/unit/test_validation.py
index c61deac40..2b9650c29 100644
--- a/tests/unit/test_validation.py
+++ b/tests/unit/test_validation.py
@@ -16,15 +16,15 @@
 TEST_CASES = [
     CaseToTest("a", "b", ["c"], None, True, [faithfulness], True),
-    CaseToTest("a", "b", ["c"], ["g"], True, [faithfulness], True),
-    CaseToTest("a", None, ["c"], ["g"], True, [context_precision], True),
-    CaseToTest("a", "b", "c", ["g"], False, [context_precision], True),
+    CaseToTest("a", "b", ["c"], "g", True, [faithfulness], True),
+    CaseToTest("a", None, ["c"], "g", True, [context_precision], True),
+    CaseToTest("a", "b", "c", "g", False, [context_precision], True),
     CaseToTest(
         "a", None, [["c"]], None, False, [context_precision, answer_relevancy], False
     ),
-    CaseToTest("a", None, ["c"], "g", False, [context_precision], True),
+    CaseToTest("a", None, ["c"], ["g"], False, [context_precision], True),
     CaseToTest("a", None, ["c"], [["g"]], False, [context_precision], True),
-    CaseToTest(1, None, ["c"], ["g"], False, [context_precision], True),
+    CaseToTest(1, None, ["c"], "g", False, [context_precision], True),
     CaseToTest(1, None, None, None, False, [context_precision], False),
 ]
 
@@ -39,7 +39,7 @@ def test_validate_column_dtypes(testcase):
     if testcase.c is not None:
         dataset_dict["contexts"] = [testcase.c]
     if testcase.g is not None:
-        dataset_dict["ground_truths"] = [testcase.g]
+        dataset_dict["ground_truth"] = [testcase.g]
 
     test_dataset = Dataset.from_dict(dataset_dict)
     if testcase.is_valid_columns:
@@ -59,7 +59,7 @@ def test_validate_columns_and_metrics(testcase):
     if testcase.c is not None:
         dataset_dict["contexts"] = [testcase.c]
     if testcase.g is not None:
-        dataset_dict["ground_truths"] = [testcase.g]
+        dataset_dict["ground_truth"] = [testcase.g]
 
     test_dataset = Dataset.from_dict(dataset_dict)
     if testcase.is_valid_metrics:
@@ -74,7 +74,7 @@ def test_validate_columns_and_metrics(testcase):
             "question": "query",
             "answer": "rag_answer",
             "contexts": "rag_contexts",
-            "ground_truths": "original_answer",
+            "ground_truth": "original_answer",
         }, # all columns present
         {
             "question": "query",