fix (metrics): changed ground_truths -> ground_truth (#521)

jjmachan committed Jan 27, 2024
1 parent 594871f commit dbdf54b
Showing 16 changed files with 71 additions and 78 deletions.
7 changes: 4 additions & 3 deletions docs/getstarted/evaluation.md
@@ -36,8 +36,9 @@ Ideally your list of questions should reflect the questions your users give, inc
:caption: import sample dataset
from datasets import load_dataset
fiqa_eval = load_dataset("explodinggradients/fiqa", "ragas_eval")
fiqa_eval
# loading the V2 dataset
amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")
amnesty_qa
```

:::{seealso}
@@ -85,7 +86,7 @@ Running the evaluation is as simple as calling evaluate on the `Dataset` with th
from ragas import evaluate
result = evaluate(
fiqa_eval["baseline"].select(range(3)), # selecting only 3
amnesty_qa["eval"],
metrics=[
context_precision,
faithfulness,
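Pulling the pieces of this hunk together, a minimal sketch of the updated quick-start flow might look like the following. It assumes an OpenAI API key is configured (Ragas defaults to `ChatOpenAI`), trims the run to three rows to keep it cheap, and uses only the two metrics visible in this hunk.

```python
from datasets import load_dataset

from ragas import evaluate
from ragas.metrics import context_precision, faithfulness

# V2 dataset with the single-string "ground_truth" column this commit standardises on.
amnesty_qa = load_dataset("explodinggradients/amnesty_qa", "english_v2")

result = evaluate(
    amnesty_qa["eval"].select(range(3)),  # only 3 rows, to keep the example small
    metrics=[context_precision, faithfulness],
)
print(result)
```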
3 changes: 2 additions & 1 deletion docs/howtos/customisations/aws-bedrock.ipynb
@@ -48,7 +48,8 @@
"# data\n",
"from datasets import load_dataset\n",
"\n",
"amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english\")"
"amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
"amnesty_qa"
]
},
{
6 changes: 3 additions & 3 deletions docs/howtos/customisations/azure-openai.ipynb
@@ -75,8 +75,8 @@
"# data\n",
"from datasets import load_dataset\n",
"\n",
"fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
"fiqa_eval"
"amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
"amnesty_qa"
]
},
{
@@ -241,7 +241,7 @@
],
"source": [
"result = evaluate(\n",
" fiqa_eval[\"baseline\"], metrics=metrics, llm=azure_model, embeddings=azure_embeddings\n",
" amnesty_qa[\"eval\"], metrics=metrics, llm=azure_model, embeddings=azure_embeddings\n",
")\n",
"\n",
"result"
2 changes: 1 addition & 1 deletion docs/howtos/customisations/embeddings.ipynb
@@ -92,7 +92,7 @@
}
],
"source": [
"#dataset\n",
"# dataset\n",
"from datasets import load_dataset\n",
"\n",
"amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english\")\n",
42 changes: 14 additions & 28 deletions docs/howtos/customisations/gcp-vertexai.ipynb
@@ -34,7 +34,9 @@
"cell_type": "code",
"execution_count": 1,
"id": "0d3e6c99-c19c-44a1-8f05-4bde2de30866",
"metadata": {},
"metadata": {
"lines_to_next_cell": 0
},
"outputs": [
{
"name": "stderr",
@@ -77,25 +79,20 @@
"# data\n",
"from datasets import load_dataset\n",
"\n",
"fiqa_eval = load_dataset(\"explodinggradients/fiqa\", \"ragas_eval\")\n",
"fiqa_eval"
"amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
"amnesty_qa"
]
},
{
"cell_type": "markdown",
"id": "4e67daaa-60e3-4584-8ec6-944c3c5a1a0c",
"metadata": {},
"source": [
"Now lets import the metrics we are going to use"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "42081210-3c0d-4e27-974a-ef152364a4ab",
"metadata": {},
"outputs": [],
"source": [
"```\n",
"\n",
"Now lets import the metrics we are going to use\n",
"\n",
"```python\n",
"from ragas.metrics import (\n",
" context_precision,\n",
" answer_relevancy, # AnswerRelevancy\n",
@@ -111,14 +108,9 @@
" context_recall,\n",
" context_precision,\n",
" harmfulness,\n",
"]"
]
},
{
"cell_type": "markdown",
"id": "90fa19c3-1356-412f-a39d-f9907c69a80e",
"metadata": {},
"source": [
"]\n",
"```\n",
"\n",
"By default Ragas uses `ChatOpenAI` for evaluations, lets swap that out with `ChatVertextAI`. We also need to change the embeddings used for evaluations for `OpenAIEmbeddings` to `VertextAIEmbeddings` for metrices that need it, which in our case is `answer_relevancy`.\n",
"\n",
"Now in order to use the new `ChatVertextAI` llm instance with Ragas metrics, you have to create a new instance of `RagasLLM` using the `ragas.llms.LangchainLLM` wrapper. Its a simple wrapper around langchain that make Langchain LLM/Chat instances compatible with how Ragas metrics will use them."
@@ -275,15 +267,9 @@
],
"source": [
"from ragas import evaluate\n",
"import nest_asyncio # CHECK NOTES\n",
"\n",
"# NOTES: Only used when running on a jupyter notebook, otherwise comment or remove this function.\n",
"nest_asyncio.apply()\n",
"\n",
"result = evaluate(\n",
" fiqa_eval[\"baseline\"].select(\n",
" range(1)\n",
" ), # using 1 as example due to quota constrains\n",
" amnesty_qa[\"eval\"].select(range(1)), # using 1 as example due to quota constrains\n",
" metrics=metrics,\n",
")\n",
"\n",
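To make the wrapper step described in the markdown cell above concrete, here is a hedged sketch of swapping in Vertex AI. The `ChatVertexAI`/`VertexAIEmbeddings` import paths and the `LangchainLLM(llm=...)` keyword are assumptions and not taken from this diff, and GCP credential/project setup is omitted.

```python
# Sketch only: exact import paths and constructor keywords may differ by version.
from langchain.chat_models import ChatVertexAI
from langchain.embeddings import VertexAIEmbeddings

from ragas.llms import LangchainLLM
from ragas.metrics import answer_relevancy

vertex_llm = LangchainLLM(llm=ChatVertexAI())  # assumes GCP credentials are already configured
vertex_embeddings = VertexAIEmbeddings()

# Point a metric at the wrapped model instead of the default ChatOpenAI;
# answer_relevancy also needs the swapped-in embeddings.
answer_relevancy.llm = vertex_llm
answer_relevancy.embeddings = vertex_embeddings
```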
4 changes: 2 additions & 2 deletions docs/howtos/customisations/llms.ipynb
@@ -105,7 +105,7 @@
"# data\n",
"from datasets import load_dataset\n",
"\n",
"amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english\")\n",
"amnesty_qa = load_dataset(\"explodinggradients/amnesty_qa\", \"english_v2\")\n",
"amnesty_qa"
]
},
@@ -210,7 +210,7 @@
" openai_api_base=inference_server_url,\n",
" max_tokens=5,\n",
" temperature=0,\n",
")\n"
")"
]
},
{
4 changes: 2 additions & 2 deletions src/ragas/llms/base.py
@@ -10,9 +10,9 @@
from langchain_core.outputs import LLMResult
from tenacity import (
retry,
stop_after_attempt,
stop_after_attempt, # for exponential backoff
wait_random_exponential,
) # for exponential backoff
)

if t.TYPE_CHECKING:
from langchain_core.callbacks import Callbacks
4 changes: 2 additions & 2 deletions src/ragas/metrics/_answer_correctness.py
@@ -141,7 +141,7 @@ def _compute_statement_presence(self, prediction: t.Any) -> float:

def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
assert self.llm is not None, "LLM must be set"
q, a, g = row["question"], row["answer"], row["ground_truths"][0]
q, a, g = row["question"], row["answer"], row["ground_truth"]
p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a)
is_statement_present = self.llm.generate_text(p_value, callbacks=callbacks)

@@ -165,7 +165,7 @@ def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
assert self.llm is not None, "LLM must be set"

q, a, g = row["question"], row["answer"], row["ground_truths"][0]
q, a, g = row["question"], row["answer"], row["ground_truth"]
p_value = self.correctness_prompt.format(question=q, ground_truth=g, answer=a)
is_statement_present = await self.llm.agenerate_text(
p_value, callbacks=callbacks
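As a plain-Python illustration of the shape change this file's hunks encode, the row handed to `_score`/`_ascore` now carries a single string under `ground_truth` where it previously carried a one-element list under `ground_truths`; the field values below are made up.

```python
# Old row shape (pre-#521): the metric indexed into a list.
old_row = {
    "question": "Who wrote Dune?",
    "answer": "Frank Herbert wrote Dune.",
    "ground_truths": ["Frank Herbert"],
}
g_old = old_row["ground_truths"][0]

# New row shape: a plain string, read directly.
new_row = {
    "question": "Who wrote Dune?",
    "answer": "Frank Herbert wrote Dune.",
    "ground_truth": "Frank Herbert",
}
g_new = new_row["ground_truth"]

assert g_old == g_new  # same value, simpler access
```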
18 changes: 9 additions & 9 deletions src/ragas/metrics/_answer_similarity.py
@@ -59,15 +59,15 @@ def init_model(self):
def _score(self, row: t.Dict, callbacks: Callbacks) -> float:
assert self.embeddings is not None, "embeddings must be set"

ground_truths, answers = row["ground_truths"], row["answer"]
ground_truths = [item[0] for item in ground_truths]
ground_truth, answers = row["ground_truth"], row["answer"]
ground_truth = [item[0] for item in ground_truth]

if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings):
raise NotImplementedError(
"async score [ascore()] not implemented for HuggingFace embeddings"
)
else:
embeddings_1 = np.array(self.embeddings.embed_documents(ground_truths))
embeddings_1 = np.array(self.embeddings.embed_documents(ground_truth))
embeddings_2 = np.array(self.embeddings.embed_documents(answers))
similarity = embeddings_1 @ embeddings_2.T
if similarity.size == 1:
@@ -79,25 +79,25 @@ def _score(self, row: t.Dict, callbacks: Callbacks) -> float:

assert isinstance(scores, np.ndarray), "Expects ndarray"
if self.threshold:
scores = scores >= self.threshold # type: ignore
scores = scores >= self.threshold

return scores.tolist()[0]

async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float:
assert self.embeddings is not None, "embeddings must be set"

ground_truths, answers = row["ground_truths"], row["answer"]
ground_truths = [item[0] for item in ground_truths]
ground_truth: t.List[str] = row["ground_truth"]
answer: t.List[str] = row["answer"]

if self.is_cross_encoder and isinstance(self.embeddings, HuggingfaceEmbeddings):
raise NotImplementedError(
"async score [ascore()] not implemented for HuggingFace embeddings"
)
else:
embeddings_1 = np.array(
await self.embeddings.aembed_documents(ground_truths)
await self.embeddings.aembed_documents(ground_truth)
)
embeddings_2 = np.array(await self.embeddings.aembed_documents(answers))
embeddings_2 = np.array(await self.embeddings.aembed_documents(answer))
similarity = embeddings_1 @ embeddings_2.T
if similarity.size == 1:
scores = similarity.flatten()
@@ -106,7 +106,7 @@ async def _ascore(self: t.Self, row: t.Dict, callbacks: Callbacks = []) -> float

assert isinstance(scores, np.ndarray), "Expects ndarray"
if self.threshold:
scores = scores >= self.threshold # type: ignore
scores = scores >= self.threshold

return scores.tolist()[0]

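The similarity math in this file is untouched by the rename; a small NumPy sketch of what `embeddings_1 @ embeddings_2.T` followed by the optional threshold yields (the vectors are toy stand-ins for real embeddings):

```python
import numpy as np

embeddings_1 = np.array([[1.0, 0.0]])       # stand-in for the embedded ground_truth
embeddings_2 = np.array([[0.8, 0.6]])       # stand-in for the embedded answer

similarity = embeddings_1 @ embeddings_2.T  # dot product; cosine similarity for unit vectors
scores = similarity.flatten()

threshold = 0.5
binary = scores >= threshold                # boolean score when a threshold is set

print(scores.tolist()[0], bool(binary[0]))  # 0.8 True
```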
2 changes: 1 addition & 1 deletion src/ragas/metrics/_context_precision.py
@@ -73,7 +73,7 @@ class ContextPrecision(MetricWithLLM):
batch_size: int = 15

def _get_row_attributes(self, row: t.Dict) -> t.Tuple[str, t.List[str], t.Any]:
answer = "ground_truths"
answer = "ground_truth"
if answer not in row.keys():
logger.warning(
"Using 'context_precision' without ground truth will be soon depreciated. Use 'context_utilization' instead"
3 changes: 1 addition & 2 deletions src/ragas/metrics/_context_recall.py
@@ -87,8 +87,7 @@ class ContextRecall(MetricWithLLM):
batch_size: int = 15

def _create_context_recall_prompt(self, row: t.Dict) -> PromptValue:
qstn, ctx, gt = row["question"], row["contexts"], row["ground_truths"]
gt = "\n".join(gt) if isinstance(gt, list) else gt
qstn, ctx, gt = row["question"], row["contexts"], row["ground_truth"]
ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx

return self.context_recall_prompt.format(question=qstn, context=ctx, answer=gt)
8 changes: 4 additions & 4 deletions src/ragas/testset/evolutions.py
@@ -37,8 +37,8 @@ class CurrentNodes:

class DataRow(BaseModel):
question: str
context: str
answer: str
contexts: t.List[str]
ground_truth: str
evolution_type: str


@@ -149,8 +149,8 @@ def generate_datarow(

return DataRow(
question=question,
context=merged_nodes.page_content,
answer="" if answer is None else answer,
contexts=[n.page_content for n in current_nodes.nodes],
ground_truth="" if answer is None else answer,
evolution_type=evolution_type,
)

12 changes: 9 additions & 3 deletions src/ragas/testset/generator.py
@@ -5,6 +5,7 @@
from dataclasses import dataclass

import pandas as pd
from datasets import Dataset
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

@@ -24,8 +25,8 @@
from ragas.testset.filters import EvolutionFilter, NodeFilter, QuestionFilter

if t.TYPE_CHECKING:
from llama_index.readers.schema import Document as LlamaindexDocument
from langchain_core.documents import Document as LCDocument
from llama_index.readers.schema import Document as LlamaindexDocument

logger = logging.getLogger(__name__)

@@ -41,14 +42,19 @@ class TestDataset:

test_data: t.List[DataRow]

def to_pandas(self) -> pd.DataFrame:
def _to_records(self) -> t.List[t.Dict]:
data_samples = []
for data in self.test_data:
data_dict = dict(data)
data_dict["episode_done"] = True
data_samples.append(data_dict)
return data_samples

def to_pandas(self) -> pd.DataFrame:
return pd.DataFrame.from_records(self._to_records())

return pd.DataFrame.from_records(data_samples)
def to_dataset(self) -> Dataset:
return Dataset.from_list(self._to_records())


@dataclass
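With the refactor above, `to_pandas()` and the new `to_dataset()` both go through `_to_records()`. A short hedged usage sketch follows; the `TestDataset`/`DataRow` construction is illustrative, with placeholder values, and assumes a ragas install containing this change.

```python
from ragas.testset.evolutions import DataRow
from ragas.testset.generator import TestDataset

testset = TestDataset(
    test_data=[
        DataRow(
            question="What does Article 19 protect?",
            contexts=["Article 19 protects freedom of opinion and expression."],
            ground_truth="Freedom of opinion and expression.",
            evolution_type="simple",
        )
    ]
)

df = testset.to_pandas()    # pandas DataFrame; each record also gets episode_done=True
ds = testset.to_dataset()   # datasets.Dataset built from the same records
print(df.columns.tolist())
print(ds.features)
```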
16 changes: 8 additions & 8 deletions src/ragas/validation.py
@@ -16,14 +16,14 @@ def remap_column_names(dataset: Dataset, column_map: dict[str, str]) -> Dataset:


def validate_column_dtypes(ds: Dataset):
for column_names in ["question", "answer"]:
for column_names in ["question", "answer", "ground_truth"]:
if column_names in ds.features:
if ds.features[column_names].dtype != "string":
raise ValueError(
f'Dataset feature "{column_names}" should be of type string'
)

for column_names in ["contexts", "ground_truths"]:
for column_names in ["contexts"]:
if column_names in ds.features:
if not (
isinstance(ds.features[column_names], Sequence)
@@ -39,10 +39,10 @@ def validate_column_dtypes(ds: Dataset):
EvaluationMode.qac: ["question", "answer", "contexts"],
EvaluationMode.qa: ["question", "answer"],
EvaluationMode.qc: ["question", "contexts"],
EvaluationMode.gc: ["ground_truths", "contexts"],
EvaluationMode.ga: ["ground_truths", "answer"],
EvaluationMode.qga: ["question", "ground_truths", "answer"],
EvaluationMode.qcg: ["question", "contexts", "ground_truths"],
EvaluationMode.gc: ["ground_truth", "contexts"],
EvaluationMode.ga: ["ground_truth", "answer"],
EvaluationMode.qga: ["question", "ground_truth", "answer"],
EvaluationMode.qcg: ["question", "contexts", "ground_truth"],
}


@@ -64,9 +64,9 @@ def validate_evaluation_modes(ds: Dataset, metrics: list[Metric]):
extra_msg = ""
if (
isinstance(m, ContextPrecision)
and "ground_truths" not in available_columns
and "ground_truth" not in available_columns
):
extra_msg = "Looks like you're trying to use 'context_precision' without ground_truths. Please use consider using `context_utilization' instead."
extra_msg = "Looks like you're trying to use 'context_precision' without ground_truth. Please use consider using `context_utilization' instead."

raise ValueError(
f"The metric [{m.name}] that that is used requires the following "
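A quick sketch of a dataset that passes the updated checks: scalar string features for `question`, `answer`, and the renamed `ground_truth`, and a sequence of strings for `contexts`. Values are placeholders; the import path follows this file's location at `src/ragas/validation.py`.

```python
from datasets import Dataset

from ragas.validation import validate_column_dtypes

ds = Dataset.from_dict(
    {
        "question": ["What is Ragas?"],
        "answer": ["An evaluation framework for RAG pipelines."],
        "contexts": [["Ragas provides reference-free metrics for RAG evaluation."]],
        "ground_truth": ["An evaluation framework for RAG pipelines."],  # string, not a list
    }
)

validate_column_dtypes(ds)  # raises ValueError if a column has the wrong dtype
print(ds.features)
```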
2 changes: 1 addition & 1 deletion tests/benchmarks/benchmark_eval.py
@@ -17,7 +17,7 @@
from ragas.metrics.critique import harmfulness

# data
ds = load_dataset("explodinggradients/amnesty_qa", "english")
ds = load_dataset("explodinggradients/amnesty_qa", "english_v2")
assert isinstance(ds, DatasetDict)
eval_dataset = ds["eval"]
