From dbdf54b3e416a338424a734564dc5e0333475f84 Mon Sep 17 00:00:00 2001
From: Jithin James
Date: Fri, 26 Jan 2024 16:44:57 -0800
Subject: [PATCH] fix (metrics): changed ground_truths -> ground_truth (#521)

---
 docs/getstarted/evaluation.md                 |  7 ++--
 docs/howtos/customisations/aws-bedrock.ipynb  |  3 +-
 docs/howtos/customisations/azure-openai.ipynb |  6 +--
 docs/howtos/customisations/embeddings.ipynb   |  2 +-
 docs/howtos/customisations/gcp-vertexai.ipynb | 42 +++++++------------
 docs/howtos/customisations/llms.ipynb         |  4 +-
 src/ragas/llms/base.py                        |  4 +-
 src/ragas/metrics/_answer_correctness.py      |  4 +-
 src/ragas/metrics/_answer_similarity.py       | 18 ++++----
 src/ragas/metrics/_context_precision.py       |  2 +-
 src/ragas/metrics/_context_recall.py          |  3 +-
 src/ragas/testset/evolutions.py               |  8 ++--
 src/ragas/testset/generator.py                | 12 ++++--
 src/ragas/validation.py                       | 16 +++----
 tests/benchmarks/benchmark_eval.py            |  2 +-
 tests/unit/test_validation.py                 | 16 +++----
 16 files changed, 71 insertions(+), 78 deletions(-)

diff --git a/docs/getstarted/evaluation.md b/docs/getstarted/evaluation.md
index 0091d1832..df4029620 100644
--- a/docs/getstarted/evaluation.md
+++ b/docs/getstarted/evaluation.md
@@ -36,8 +36,9 @@ Ideally your list of questions should reflect the questions your users give, inc
 :caption: import sample dataset
 from datasets import load_dataset
 
-fiqa_eval = load_dataset("explodinggradients/fiqa", "ragas_eval")
-fiqa_eval
+# loading the V2 dataset
+amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")
+amnesty_qa
 ```
 
 :::{seealso}
@@ -85,7 +86,7 @@ Running the evaluation is as simple as calling evaluate on the `Dataset` with th
 from ragas import evaluate
 
 result = evaluate(
-    fiqa_eval["baseline"].select(range(3)), # selecting only 3
+    amnesty_qa["eval"],
     metrics=[
         context_precision,
         faithfulness,
diff --git a/docs/howtos/customisations/aws-bedrock.ipynb b/docs/howtos/customisations/aws-bedrock.ipynb
index 9d1daee65..341e618a5 100644
--- a/docs/howtos/customisations/aws-bedrock.ipynb
+++ b/docs/howtos/customisations/aws-bedrock.ipynb
@@ -48,7 +48,8 @@
     "# data\n",
     "from datasets import load_dataset\n",
     "\n",
-    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english\")"
+    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
+    "amnesty_qa"
    ]
   },
   {
diff --git a/docs/howtos/customisations/azure-openai.ipynb b/docs/howtos/customisations/azure-openai.ipynb
index 4a11b9bbd..c308841b2 100644
--- a/docs/howtos/customisations/azure-openai.ipynb
+++ b/docs/howtos/customisations/azure-openai.ipynb
@@ -75,8 +75,8 @@
     "# data\n",
     "from datasets import load_dataset\n",
     "\n",
-    "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
-    "fiqa_eval"
+    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
+    "amnesty_qa"
    ]
   },
   {
@@ -241,7 +241,7 @@
    ],
    "source": [
     "result = evaluate(\n",
-    "    fiqa_eval[\"baseline\"], metrics=metrics, llm=azure_model, embeddings=azure_embeddings\n",
+    "    amnesty_qa[\"eval\"], metrics=metrics, llm=azure_model, embeddings=azure_embeddings\n",
     ")\n",
     "\n",
     "result"
diff --git a/docs/howtos/customisations/embeddings.ipynb b/docs/howtos/customisations/embeddings.ipynb
index 861ae788d..f836cd369 100644
--- a/docs/howtos/customisations/embeddings.ipynb
+++ b/docs/howtos/customisations/embeddings.ipynb
@@ -92,7 +92,7 @@
     }
    ],
    "source": [
-    "#dataset\n",
+    "# dataset\n",
     "from datasets import load_dataset\n",
     "\n",
     "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english\")\n",
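For orientation, the documentation hunks above all converge on the same quickstart flow; the sketch below restates it in one place. It assumes `ragas` and `datasets` are installed and an OpenAI key is configured, and it only uses calls that appear in the diff.

```python
# Minimal sketch of the post-patch quickstart shown in the doc hunks above.
# Assumes `ragas` and `datasets` are installed and OPENAI_API_KEY is set.
from datasets import load_dataset

from ragas import evaluate
from ragas.metrics import context_precision, faithfulness

# english_v2 ships a single string "ground_truth" column, replacing the
# older list-valued "ground_truths"
amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")

result = evaluate(
    amnesty_qa["eval"],
    metrics=[context_precision, faithfulness],
)
print(result)
```

The same `amnesty_qa["eval"]` split is what the Azure, Bedrock, and Vertex AI notebooks below now pass to `evaluate`.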
diff --git a/docs/howtos/customisations/gcp-vertexai.ipynb b/docs/howtos/customisations/gcp-vertexai.ipynb
index 9623b84d3..70722c34a 100644
--- a/docs/howtos/customisations/gcp-vertexai.ipynb
+++ b/docs/howtos/customisations/gcp-vertexai.ipynb
@@ -34,7 +34,9 @@
    "cell_type": "code",
    "execution_count": 1,
    "id": "0d3e6c99-c19c-44a1-8f05-4bde2de30866",
-   "metadata": {},
+   "metadata": {
+    "lines_to_next_cell": 0
+   },
    "outputs": [
     {
      "name": "stderr",
@@ -77,8 +79,8 @@
     "# data\n",
     "from datasets import load_dataset\n",
     "\n",
-    "fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
-    "fiqa_eval"
+    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
+    "amnesty_qa"
    ]
   },
   {
@@ -86,16 +88,11 @@
    "id": "4e67daaa-60e3-4584-8ec6-944c3c5a1a0c",
    "metadata": {},
    "source": [
-    "Now lets import the metrics we are going to use"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "42081210-3c0d-4e27-974a-ef152364a4ab",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "```\n",
+    "\n",
+    "Now lets import the metrics we are going to use\n",
+    "\n",
+    "```python\n",
     "from ragas.metrics import (\n",
     "    context_precision,\n",
     "    answer_relevancy, # AnswerRelevancy\n",
@@ -111,14 +108,9 @@
     "    context_recall,\n",
     "    context_precision,\n",
     "    harmfulness,\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "90fa19c3-1356-412f-a39d-f9907c69a80e",
-   "metadata": {},
-   "source": [
+    "]\n",
+    "```\n",
+    "\n",
     "By default Ragas uses `ChatOpenAI` for evaluations, lets swap that out with `ChatVertextAI`. We also need to change the embeddings used for evaluations for `OpenAIEmbeddings` to `VertextAIEmbeddings` for metrices that need it, which in our case is `answer_relevancy`.\n",
     "\n",
     "Now in order to use the new `ChatVertextAI` llm instance with Ragas metrics, you have to create a new instance of `RagasLLM` using the `ragas.llms.LangchainLLM` wrapper. Its a simple wrapper around langchain that make Langchain LLM/Chat instances compatible with how Ragas metrics will use them."
@@ -275,15 +267,9 @@
    ],
    "source": [
     "from ragas import evaluate\n",
-    "import nest_asyncio # CHECK NOTES\n",
-    "\n",
-    "# NOTES: Only used when running on a jupyter notebook, otherwise comment or remove this function.\n",
-    "nest_asyncio.apply()\n",
     "\n",
     "result = evaluate(\n",
-    "    fiqa_eval[\"baseline\"].select(\n",
-    "        range(1)\n",
-    "    ), # using 1 as example due to quota constrains\n",
+    "    amnesty_qa[\"eval\"].select(range(1)), # using 1 as example due to quota constraints\n",
     "    metrics=metrics,\n",
     ")\n",
     "\n",
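The notebook text above describes wrapping a Vertex AI chat model with `ragas.llms.LangchainLLM` before handing it to the metrics. The sketch below illustrates that pattern under the stated assumption that `LangchainLLM` and langchain's `ChatVertexAI` are importable under these names; both have moved in newer releases, so treat it as illustrative rather than a pinned API.

```python
# Illustrative only: wraps a Vertex AI chat model for use by ragas metrics,
# following the ragas.llms.LangchainLLM pattern described in the notebook
# text above. Class names/paths may differ across ragas and langchain versions.
from langchain.chat_models import ChatVertexAI

from ragas.llms import LangchainLLM
from ragas.metrics import answer_relevancy

vertex_llm = LangchainLLM(ChatVertexAI())  # requires GCP credentials

# point the metric at the wrapped model instead of the default ChatOpenAI
answer_relevancy.llm = vertex_llm
```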
diff --git a/docs/howtos/customisations/llms.ipynb b/docs/howtos/customisations/llms.ipynb
index 27462c077..d1f34504e 100644
--- a/docs/howtos/customisations/llms.ipynb
+++ b/docs/howtos/customisations/llms.ipynb
@@ -105,7 +105,7 @@
     "# data\n",
     "from datasets import load_dataset\n",
     "\n",
-    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english\")\n",
+    "amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
     "amnesty_qa"
    ]
   },
@@ -210,7 +210,7 @@
     "    openai_api_base=inference_server_url,\n",
     "    max_tokens=5,\n",
     "    temperature=0,\n",
-    ")\n"
+    ")"
    ]
   },
   {
diff --git a/src/ragas/llms/base.py b/src/ragas/llms/base.py
index bc8210f11..da43b7776 100644
--- a/src/ragas/llms/base.py
+++ b/src/ragas/llms/base.py
@@ -10,9 +10,9 @@
 from langchain_core.outputs import LLMResult
 from tenacity import (
     retry,
-    stop_after_attempt,
+    stop_after_attempt,  # for exponential backoff
     wait_random_exponential,
-)  # for exponential backoff
+)
 
 if t.TYPE_CHECKING:
     from langchain_core.callbacks import Callbacks
diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index da7f95736..3aca756d5 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -141,7 +141,7 @@ def _compute_statement_presence(self, prediction: t.Any) -> float:
     def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "LLM must be set"
 
-        q, a, g = row["question"], row["answer"], row["ground_truths"][0]
+        q, a, g = row["question"], row["answer"], row["ground_truth"]
         p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a)
         is_statement_present = self.llm.generate_text(p_value, callbacks=callbacks)
 
@@ -165,7 +165,7 @@ def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
     async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.llm is not None, "LLM must be set"
 
-        q, a, g = row["question"], row["answer"], row["ground_truths"][0]
+        q, a, g = row["question"], row["answer"], row["ground_truth"]
         p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a)
         is_statement_present = await self.llm.agenerate_text(
             p_value, callbacks=callbacks
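The two `_answer_correctness.py` hunks above swap `row["ground_truths"][0]` for `row["ground_truth"]`. The toy rows below illustrate that contract change; they are not ragas code, just the keys the metric reads.

```python
# Toy illustration of the row shape change behind this patch (not ragas code).
old_row = {
    "question": "Who published the report?",
    "answer": "Amnesty International published the report.",
    "ground_truths": ["Amnesty International published the report."],  # list of strings
}

new_row = {
    "question": "Who published the report?",
    "answer": "Amnesty International published the report.",
    "ground_truth": "Amnesty International published the report.",  # plain string
}

# what _score()/_ascore() now read, per the hunks above
q, a, g = new_row["question"], new_row["answer"], new_row["ground_truth"]
assert g == old_row["ground_truths"][0]
```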
diff --git a/src/ragas/metrics/_answer_similarity.py b/src/ragas/metrics/_answer_similarity.py
index f6a3669fb..c389a8919 100644
--- a/src/ragas/metrics/_answer_similarity.py
+++ b/src/ragas/metrics/_answer_similarity.py
@@ -59,15 +59,15 @@ def init_model(self):
     def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert self.embeddings is not None, "embeddings must be set"
 
-        ground_truths, answers = row["ground_truths"], row["answer"]
-        ground_truths = [item[0] for item in ground_truths]
+        ground_truth, answers = row["ground_truth"], row["answer"]
+        ground_truth = [item[0] for item in ground_truth]
 
         if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings):
             raise NotImplementedError(
                 "async score [ascore()] not implemented for HuggingFace embeddings"
             )
         else:
-            embeddings_1 = np.array(self.embeddings.embed_documents(ground_truths))
+            embeddings_1 = np.array(self.embeddings.embed_documents(ground_truth))
             embeddings_2 = np.array(self.embeddings.embed_documents(answers))
             similarity = embeddings_1 @ embeddings_2.T
             if similarity.size == 1:
@@ -79,15 +79,15 @@ def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
         assert isinstance(scores, np.ndarray), "Expects ndarray"
         if self.threshold:
-            scores = scores >= self.threshold  # type: ignore
+            scores = scores >= self.threshold
 
         return scores.tolist()[0]
 
     async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float:
         assert self.embeddings is not None, "embeddings must be set"
 
-        ground_truths, answers = row["ground_truths"], row["answer"]
-        ground_truths = [item[0] for item in ground_truths]
+        ground_truth: t.List[str] = row["ground_truth"]
+        answer: t.List[str] = row["answer"]
 
         if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings):
             raise NotImplementedError(
@@ -95,9 +95,9 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float
             )
         else:
             embeddings_1 = np.array(
-                await self.embeddings.aembed_documents(ground_truths)
+                await self.embeddings.aembed_documents(ground_truth)
             )
-            embeddings_2 = np.array(await self.embeddings.aembed_documents(answers))
+            embeddings_2 = np.array(await self.embeddings.aembed_documents(answer))
             similarity = embeddings_1 @ embeddings_2.T
             if similarity.size == 1:
                 scores = similarity.flatten()
@@ -106,7 +106,7 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float
         assert isinstance(scores, np.ndarray), "Expects ndarray"
         if self.threshold:
-            scores = scores >= self.threshold  # type: ignore
+            scores = scores >= self.threshold
 
         return scores.tolist()[0]
 
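The `_answer_similarity.py` hunks above keep the same embedding matrix product while renaming the column. The standalone numpy sketch below mirrors that computation; `fake_embed` is a hypothetical stand-in for the real embeddings backend.

```python
# Standalone sketch of the similarity math in _answer_similarity.py above;
# fake_embed is a hypothetical stand-in for the real embeddings backend.
import numpy as np


def fake_embed(texts: list) -> np.ndarray:
    """Return one deterministic toy embedding row per input text."""
    rng = np.random.default_rng(42)
    return rng.normal(size=(len(texts), 8))


ground_truth = ["Amnesty International published the report."]
answer = ["The report was published by Amnesty International."]

embeddings_1 = fake_embed(ground_truth)
embeddings_2 = fake_embed(answer)
similarity = embeddings_1 @ embeddings_2.T  # same matrix product as in _score()/_ascore()
score = float(similarity.flatten()[0])
print(score)
```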
diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py
index 22dcd87b9..a5b723066 100644
--- a/src/ragas/metrics/_context_precision.py
+++ b/src/ragas/metrics/_context_precision.py
@@ -73,7 +73,7 @@ class ContextPrecision(MetricWithLLM):
     batch_size: int = 15
 
     def _get_row_attributes(self, row: t.Dict) -> t.Tuple[str, t.List[str], t.Any]:
-        answer = "ground_truths"
+        answer = "ground_truth"
         if answer not in row.keys():
             logger.warning(
                 "Using 'context_precision' without ground truth will be soon depreciated. Use 'context_utilization' instead"
diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py
index 0c7e15e65..b1185b9e4 100644
--- a/src/ragas/metrics/_context_recall.py
+++ b/src/ragas/metrics/_context_recall.py
@@ -87,8 +87,7 @@ class ContextRecall(MetricWithLLM):
     batch_size: int = 15
 
     def _create_context_recall_prompt(self, row: t.Dict) -> PromptValue:
-        qstn, ctx, gt = row["question"], row["contexts"], row["ground_truths"]
-        gt = "\n".join(gt) if isinstance(gt, list) else gt
+        qstn, ctx, gt = row["question"], row["contexts"], row["ground_truth"]
         ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx
 
         return self.context_recall_prompt.format(question=qstn, context=ctx, answer=gt)
diff --git a/src/ragas/testset/evolutions.py b/src/ragas/testset/evolutions.py
index 0f9002f89..255a1b93a 100644
--- a/src/ragas/testset/evolutions.py
+++ b/src/ragas/testset/evolutions.py
@@ -37,8 +37,8 @@ class CurrentNodes:
 
 class DataRow(BaseModel):
     question: str
-    context: str
-    answer: str
+    contexts: t.List[str]
+    ground_truth: str
     evolution_type: str
 
 
@@ -149,8 +149,8 @@ def generate_datarow(
 
         return DataRow(
             question=question,
-            context=merged_nodes.page_content,
-            answer="" if answer is None else answer,
+            contexts=[n.page_content for n in current_nodes.nodes],
+            ground_truth="" if answer is None else answer,
            evolution_type=evolution_type,
         )
 
diff --git a/src/ragas/testset/generator.py b/src/ragas/testset/generator.py
index f0b292268..425d5f349 100644
--- a/src/ragas/testset/generator.py
+++ b/src/ragas/testset/generator.py
@@ -5,6 +5,7 @@
 from dataclasses import dataclass
 
 import pandas as pd
+from datasets import Dataset
 from langchain_openai.chat_models import ChatOpenAI
 from langchain_openai.embeddings import OpenAIEmbeddings
 
@@ -24,8 +25,8 @@
 from ragas.testset.filters import EvolutionFilter, NodeFilter, QuestionFilter
 
 if t.TYPE_CHECKING:
-    from llama_index.readers.schema import Document as LlamaindexDocument
     from langchain_core.documents import Document as LCDocument
+    from llama_index.readers.schema import Document as LlamaindexDocument
 
 logger = logging.getLogger(__name__)
 
@@ -41,14 +42,19 @@ class TestDataset:
 
     test_data: t.List[DataRow]
 
-    def to_pandas(self) -> pd.DataFrame:
+    def _to_records(self) -> t.List[t.Dict]:
         data_samples = []
         for data in self.test_data:
             data_dict = dict(data)
             data_dict["episode_done"] = True
             data_samples.append(data_dict)
+        return data_samples
+
+    def to_pandas(self) -> pd.DataFrame:
+        return pd.DataFrame.from_records(self._to_records())
 
-        return pd.DataFrame.from_records(data_samples)
+    def to_dataset(self) -> Dataset:
+        return Dataset.from_list(self._to_records())
 
 
 @dataclass
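The `evolutions.py` and `generator.py` hunks above change the generated-row schema (`contexts`/`ground_truth`) and add a `to_dataset()` export. The sketch below shows how those pieces fit together; field names come from the hunks, while the import paths are inferred from this patch's file layout and may be re-exported elsewhere.

```python
# Sketch of the new testset schema and export paths; field names mirror the
# DataRow/TestDataset hunks above, import paths are inferred from this patch.
from ragas.testset.evolutions import DataRow
from ragas.testset.generator import TestDataset

row = DataRow(
    question="Who published the report?",
    contexts=["Amnesty International published a report in 2022 ..."],
    ground_truth="Amnesty International published the report.",
    evolution_type="simple",
)

testset = TestDataset(test_data=[row])
df = testset.to_pandas()   # as before, now routed through _to_records()
ds = testset.to_dataset()  # new: a datasets.Dataset, with episode_done=True added
```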
diff --git a/src/ragas/validation.py b/src/ragas/validation.py
index fa1c4471d..17a6e3f4f 100644
--- a/src/ragas/validation.py
+++ b/src/ragas/validation.py
@@ -16,14 +16,14 @@ def remap_column_names(dataset: Dataset, column_map: dict[str, str]) -> Dataset:
 
 
 def validate_column_dtypes(ds: Dataset):
-    for column_names in ["question", "answer"]:
+    for column_names in ["question", "answer", "ground_truth"]:
         if column_names in ds.features:
             if ds.features[column_names].dtype != "string":
                 raise ValueError(
                     f'Dataset feature "{column_names}" should be of type string'
                 )
 
-    for column_names in ["contexts", "ground_truths"]:
+    for column_names in ["contexts"]:
         if column_names in ds.features:
             if not (
                 isinstance(ds.features[column_names], Sequence)
@@ -39,10 +39,10 @@ def validate_column_dtypes(ds: Dataset):
     EvaluationMode.qac: ["question", "answer", "contexts"],
     EvaluationMode.qa: ["question", "answer"],
     EvaluationMode.qc: ["question", "contexts"],
-    EvaluationMode.gc: ["ground_truths", "contexts"],
-    EvaluationMode.ga: ["ground_truths", "answer"],
-    EvaluationMode.qga: ["question", "ground_truths", "answer"],
-    EvaluationMode.qcg: ["question", "contexts", "ground_truths"],
+    EvaluationMode.gc: ["ground_truth", "contexts"],
+    EvaluationMode.ga: ["ground_truth", "answer"],
+    EvaluationMode.qga: ["question", "ground_truth", "answer"],
+    EvaluationMode.qcg: ["question", "contexts", "ground_truth"],
 }
 
 
@@ -64,9 +64,9 @@ def validate_evaluation_modes(ds: Dataset, metrics: list[Metric]):
             extra_msg = ""
             if (
                 isinstance(m, ContextPrecision)
-                and "ground_truths" not in available_columns
+                and "ground_truth" not in available_columns
             ):
-                extra_msg = "Looks like you're trying to use 'context_precision' without ground_truths. Please use consider using `context_utilization' instead."
+                extra_msg = "Looks like you're trying to use 'context_precision' without ground_truth. Please consider using 'context_utilization' instead."
 
             raise ValueError(
                 f"The metric [{m.name}] that that is used requires the following "
diff --git a/tests/benchmarks/benchmark_eval.py b/tests/benchmarks/benchmark_eval.py
index ae91c9937..03d2348f3 100644
--- a/tests/benchmarks/benchmark_eval.py
+++ b/tests/benchmarks/benchmark_eval.py
@@ -17,7 +17,7 @@ from ragas.metrics.critique import harmfulness
 
 # data
-ds = load_dataset("explodinggradients/amnesty_qa", "english")
+ds = load_dataset("explodinggradients/amnesty_qa", "english_v2")
 assert isinstance(ds, DatasetDict)
 
 eval_dataset = ds["eval"]
diff --git a/tests/unit/test_validation.py b/tests/unit/test_validation.py
index c61deac40..2b9650c29 100644
--- a/tests/unit/test_validation.py
+++ b/tests/unit/test_validation.py
@@ -16,15 +16,15 @@
 TEST_CASES = [
     CaseToTest("a", "b", ["c"], None, True, [faithfulness], True),
-    CaseToTest("a", "b", ["c"], ["g"], True, [faithfulness], True),
-    CaseToTest("a", None, ["c"], ["g"], True, [context_precision], True),
-    CaseToTest("a", "b", "c", ["g"], False, [context_precision], True),
+    CaseToTest("a", "b", ["c"], "g", True, [faithfulness], True),
+    CaseToTest("a", None, ["c"], "g", True, [context_precision], True),
+    CaseToTest("a", "b", "c", "g", False, [context_precision], True),
     CaseToTest(
         "a", None, [["c"]], None, False, [context_precision, answer_relevancy], False
     ),
-    CaseToTest("a", None, ["c"], "g", False, [context_precision], True),
+    CaseToTest("a", None, ["c"], ["g"], False, [context_precision], True),
     CaseToTest("a", None, ["c"], [["g"]], False, [context_precision], True),
-    CaseToTest(1, None, ["c"], ["g"], False, [context_precision], True),
+    CaseToTest(1, None, ["c"], "g", False, [context_precision], True),
     CaseToTest(1, None, None, None, False, [context_precision], False),
 ]
 
@@ -39,7 +39,7 @@ def test_validate_column_dtypes(testcase):
     if testcase.c is not None:
         dataset_dict["contexts"] = [testcase.c]
     if testcase.g is not None:
-        dataset_dict["ground_truths"] = [testcase.g]
+        dataset_dict["ground_truth"] = [testcase.g]
 
     test_dataset = Dataset.from_dict(dataset_dict)
     if testcase.is_valid_columns:
@@ -59,7 +59,7 @@ def test_validate_columns_and_metrics(testcase):
     if testcase.c is not None:
         dataset_dict["contexts"] = [testcase.c]
     if testcase.g is not None:
-        dataset_dict["ground_truths"] = [testcase.g]
+        dataset_dict["ground_truth"] = [testcase.g]
 
     test_dataset = Dataset.from_dict(dataset_dict)
     if testcase.is_valid_metrics:
@@ -74,7 +74,7 @@ def test_validate_columns_and_metrics(testcase):
             "question": "query",
             "answer": "rag_answer",
             "contexts": "rag_contexts",
-            "ground_truths": "original_answer",
+            "ground_truth": "original_answer",
         }, # all columns present
         {
             "question": "query",