feat: Non LLM based context precision #1264

Merged
merged 25 commits on Sep 12, 2024

Commits (25)

1fce3f7  add bleu score (shahules786, Sep 9, 2024)
c4ae7d6  non llm string metrics (shahules786, Sep 9, 2024)
0f0e88a  add rogue score (shahules786, Sep 9, 2024)
546d149  modify optional dependencies (shahules786, Sep 9, 2024)
625c7b8  move rogue score (shahules786, Sep 9, 2024)
3475154  Add support to MultiTurn Metrics (#1249) (shahules786, Sep 9, 2024)
aa7330b  change required_columns type (shahules786, Sep 9, 2024)
4db0b8d  Merge branch 'main' into non-llm-metrics (shahules786, Sep 9, 2024)
7329a8e  add rapidfzz (shahules786, Sep 9, 2024)
2f1dcac  add context precision non llm based (shahules786, Sep 10, 2024)
634d587  normalise distances (shahules786, Sep 10, 2024)
d2c628d  round scores (shahules786, Sep 10, 2024)
854136f  Merge branch 'main' into metrics#1 (shahules786, Sep 10, 2024)
ba6d1f9  name old metrics (shahules786, Sep 10, 2024)
51b63a9  add docs for v2 (shahules786, Sep 10, 2024)
93c2be5  Non LLM based metrics (#1260) (shahules786, Sep 10, 2024)
2d1e540  feat: set and get prompts for metrics (#1259) (shahules786, Sep 10, 2024)
22f97e6  Merge branch 'main' into metrics#1 (shahules786, Sep 10, 2024)
3f5f7d9  remove typos (shahules786, Sep 10, 2024)
d0b0762  remove instances (shahules786, Sep 10, 2024)
c3e86b4  Merge branch 'main' into metrics#1 (shahules786, Sep 11, 2024)
b6e86bc  change stringdistance (shahules786, Sep 11, 2024)
260cbf1  Update docs/concepts/metrics/context_precision_v2.md (shahules786, Sep 12, 2024)
2244a9e  Update docs/concepts/metrics/context_precision_v2.md (shahules786, Sep 12, 2024)
8401cc6  Update docs/concepts/metrics/context_precision_v2.md (shahules786, Sep 12, 2024)
85 changes: 85 additions & 0 deletions docs/concepts/metrics/context_precision_v2.md
@@ -0,0 +1,85 @@
# Context Precision

Context Precision is a metric that measures the proportion of relevant chunks in the retrieved contexts. It is computed as the mean of precision@k taken over the ranks of the relevant chunks. Precision@k is the ratio of relevant chunks within the top k results to k.

```{math}
\text{Context Precision@K} = \frac{\sum_{k=1}^{K} \left( \text{Precision@k} \times v_k \right)}{\text{Total number of relevant items in the top } K \text{ results}}
```

```{math}
\text{Precision@k} = {\text{true positives@k} \over (\text{true positives@k} + \text{false positives@k})}
```

Where $K$ is the total number of chunks in `retrieved_contexts` and $v_k \in \{0, 1\}$ is the relevance indicator at rank $k$.
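
For example, if the relevance verdicts for three retrieved chunks are $v = [1, 0, 1]$, there are two relevant items, so

```{math}
\text{Context Precision@3} = \frac{\frac{1}{1} \cdot 1 + \frac{1}{2} \cdot 0 + \frac{2}{3} \cdot 1}{2} = \frac{5/3}{2} \approx 0.83
```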


## LLM Based Context Precision

The following metrics use an LLM to identify whether a retrieved context is relevant.

### Context Precision without reference

This metric can be used when you have retrieved contexts associated with a `user_input` but no reference. To estimate whether a retrieved context is relevant, this method uses the LLM to compare each context or chunk present in `retrieved_contexts` with the `response`.

#### Example

```{code-block} python
from ragas import SingleTurnSample
from ragas.metrics import LLMContextPrecisionWithoutReference
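
# Note: LLM-based metrics need an evaluator LLM at scoring time; in practice you
# would pass one in, e.g. LLMContextPrecisionWithoutReference(llm=evaluator_llm),
# where `evaluator_llm` is your wrapped evaluation model (the name is illustrative).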

context_precision = LLMContextPrecisionWithoutReference()

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    response="The Eiffel Tower is located in Paris.",
    retrieved_contexts=["The Eiffel Tower is located in Paris."],
)


await context_precision.single_turn_ascore(sample)
```

### Context Precision with reference

This metric can be used when you have both retrieved contexts and a reference answer associated with a `user_input`. To estimate whether a retrieved context is relevant, this method uses the LLM to compare each context or chunk present in `retrieved_contexts` with the `reference`.

#### Example

```{code-block} python
from ragas import SingleTurnSample
from ragas.metrics import LLMContextPrecisionWithReference

context_precision = LLMContextPrecisionWithReference()

sample = SingleTurnSample(
    user_input="Where is the Eiffel Tower located?",
    reference="The Eiffel Tower is located in Paris.",
    retrieved_contexts=["The Eiffel Tower is located in Paris."],
)

await context_precision.single_turn_ascore(sample)
```

## Non LLM Based Context Precision

The following metrics use traditional, non-LLM methods to identify whether a retrieved context is relevant. You can use any non-LLM metric as the distance measure for this comparison.

### Context Precision with reference contexts

This metric can be used when you have both retrieved contexts and reference contexts associated with a `user_input`. To estimate whether a retrieved context is relevant, this method uses a non-LLM distance measure to compare each context or chunk present in `retrieved_contexts` with each one present in `reference_contexts`.

#### Example

```{code-block} python
from ragas import SingleTurnSample
from ragas.metrics import NonLLMContextPrecisionWithReference

context_precision = NonLLMContextPrecisionWithReference()

sample = SingleTurnSample(
    retrieved_contexts=["The Eiffel Tower is located in Paris."],
    reference_contexts=[
        "Paris is the capital of France.",
        "The Eiffel Tower is one of the most famous landmarks in Paris.",
    ],
)

await context_precision.single_turn_ascore(sample)
```
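
The comparison defaults to the `NonLLMStringSimilarity` metric introduced alongside this PR. As a sketch (assuming the `DistanceMeasure` enum from the same string-metrics module), you could configure a different string-distance algorithm and a stricter relevance threshold:

```{code-block} python
from ragas.metrics import NonLLMContextPrecisionWithReference
from ragas.metrics._string import DistanceMeasure, NonLLMStringSimilarity

# Assumed configuration: Levenshtein distance and a higher cutoff; a retrieved
# chunk counts as relevant when its best similarity to any reference context
# reaches the threshold (default 0.5).
context_precision = NonLLMContextPrecisionWithReference(
    distance_measure=NonLLMStringSimilarity(
        distance_measure=DistanceMeasure.LEVENSHTEIN
    ),
    threshold=0.7,
)
```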
109 changes: 105 additions & 4 deletions src/ragas/metrics/_context_precision.py
@@ -10,7 +10,10 @@
from ragas.dataset_schema import SingleTurnSample
from ragas.llms.output_parser import RagasoutputParser, get_json_format_instructions
from ragas.llms.prompt import Prompt, PromptValue
from ragas.metrics._string import NonLLMStringSimilarity
from ragas.metrics.base import MetricType, MetricWithLLM, SingleTurnMetric, ensembler
from ragas.run_config import RunConfig
from ragas.utils import deprecated

if t.TYPE_CHECKING:
from langchain_core.callbacks import Callbacks
@@ -74,7 +77,7 @@ class ContextPrecisionVerifications(BaseModel):


@dataclass
-class ContextPrecision(MetricWithLLM, SingleTurnMetric):
+class LLMContextPrecisionWithReference(MetricWithLLM, SingleTurnMetric):
    """
    Average Precision is a metric that evaluates whether all of the
    relevant items selected by the model are ranked higher or not.
@@ -86,7 +89,7 @@ class ContextPrecision(MetricWithLLM, SingleTurnMetric):
    context_precision_prompt: Prompt
    """

-    name: str = "context_precision"  # type: ignore
+    name: str = "llm_context_precision_with_reference"  # type: ignore
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {
            MetricType.SINGLE_TURN: {
@@ -202,8 +205,8 @@ def save(self, cache_dir: str | None = None) -> None:


@dataclass
-class ContextUtilization(ContextPrecision):
-    name: str = "context_utilization"
+class LLMContextPrecisionWithoutReference(LLMContextPrecisionWithReference):
+    name: str = "llm_context_precision_without_reference"
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {
            MetricType.SINGLE_TURN: {"user_input", "response", "retrieved_contexts"}
@@ -214,5 +217,103 @@ def _get_row_attributes(self, row: t.Dict) -> t.Tuple[str, t.List[str], t.Any]:
        return row["user_input"], row["retrieved_contexts"], row["response"]


@dataclass
class NonLLMContextPrecisionWithReference(SingleTurnMetric):
    name: str = "non_llm_context_precision_with_reference"  # type: ignore
    _required_columns: t.Dict[MetricType, t.Set[str]] = field(
        default_factory=lambda: {
            MetricType.SINGLE_TURN: {
                "retrieved_contexts",
                "reference_contexts",
            }
        }
    )
    distance_measure: SingleTurnMetric = field(
        default_factory=lambda: NonLLMStringSimilarity()
    )
    threshold: float = 0.5

    def __post_init__(self):
        if isinstance(self.distance_measure, MetricWithLLM):
            raise ValueError(
                "distance_measure must not be an instance of MetricWithLLM for NonLLMContextPrecisionWithReference"
            )

    def init(self, run_config: RunConfig) -> None:
        ...

    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
        sample = SingleTurnSample(**row)
        return await self._single_turn_ascore(sample, callbacks)

    async def _single_turn_ascore(
        self, sample: SingleTurnSample, callbacks: Callbacks
    ) -> float:
        retrieved_contexts = sample.retrieved_contexts
        reference_contexts = sample.reference_contexts
        assert retrieved_contexts is not None, "retrieved_contexts is empty"
        assert reference_contexts is not None, "reference_contexts is empty"

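        # For each retrieved chunk, take its maximum similarity against all
        # reference chunks, then binarize with `threshold` into a 0/1 relevance verdict.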
        scores = []
        for rc in retrieved_contexts:
            scores.append(
                max(
                    [
                        await self.distance_measure.single_turn_ascore(
                            SingleTurnSample(reference=rc, response=ref), callbacks
                        )
                        for ref in reference_contexts
                    ]
                )
            )
        scores = [1 if score >= self.threshold else 0 for score in scores]
        return self._calculate_average_precision(scores)

    def _calculate_average_precision(self, verdict_list: t.List[int]) -> float:
        score = np.nan

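        # Numerator: precision@k summed over the relevant ranks (verdict == 1).
        # Denominator: total number of relevant chunks, with a small epsilon to
        # avoid division by zero when nothing is relevant.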
        denominator = sum(verdict_list) + 1e-10
        numerator = sum(
            [
                (sum(verdict_list[: i + 1]) / (i + 1)) * verdict_list[i]
                for i in range(len(verdict_list))
            ]
        )
        score = numerator / denominator
        return score


@dataclass
class ContextPrecision(LLMContextPrecisionWithReference):
    name: str = "context_precision"  # type: ignore

    async def _single_turn_ascore(
        self, sample: SingleTurnSample, callbacks: Callbacks
    ) -> float:
        return await super()._single_turn_ascore(sample, callbacks)

    @deprecated(
        since="0.2", removal="0.3", alternative="LLMContextPrecisionWithReference"
    )
    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
        return await super()._ascore(row, callbacks)


@dataclass
class ContextUtilization(LLMContextPrecisionWithoutReference):
    name: str = "context_utilization"  # type: ignore

    async def _single_turn_ascore(
        self, sample: SingleTurnSample, callbacks: Callbacks
    ) -> float:
        return await super()._single_turn_ascore(sample, callbacks)

    @deprecated(
        since="0.2", removal="0.3", alternative="LLMContextPrecisionWithoutReference"
    )
    async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
        return await super()._ascore(row, callbacks)


context_precision = ContextPrecision()
context_utilization = ContextUtilization()