Add R2 metric + test

chuvalniy · Feb 6, 2024 · 58920c8 · 58920c8
1 parent f40c70b
commit 58920c8
Show file tree

Hide file tree

Showing 11 changed files with 140 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,4 +3,4 @@
 src/__pycache__
 build
 dist
-tulia.egg-info
+tulia.egg-info
diff --git a/setup.py b/setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages
 
-VERSION = '0.2.1'
+VERSION = '0.3.0'
 DESCRIPTION = 'numpy based machine learning package with sklearn-like API'
 
 with open("README.md", "r") as fn:

diff --git a/src/ensemble/__pycache__/__init__.cpython-310.pyc b/src/ensemble/__pycache__/__init__.cpython-310.pyc
diff --git a/src/ensemble/__pycache__/boosting.cpython-310.pyc b/src/ensemble/__pycache__/boosting.cpython-310.pyc
diff --git a/src/ensemble/__pycache__/xgboost.cpython-310.pyc b/src/ensemble/__pycache__/xgboost.cpython-310.pyc
diff --git a/src/ensemble/catboost.py b/src/ensemble/catboost.py
@@ -0,0 +1,56 @@
+from typing import Union
+
+import numpy as np
+
+from src.base import Model
+
+
+# 1. Ordered Target encoding
+# 2. Bootstrap data
+# 3. Symmetric tree
+# 4.
+
+
+class CatBoostClassifier(Model):
+    """
+    CatBoost for classification tasks.
+    """
+
+    def __init__(
+            self,
+            learning_rate: float = 3e-1,
+            n_steps: int = 100,
+            max_depth: int = 3,
+            cat_features: list = None
+    ):
+        pass
+
+
+    def fit(self, x: np.ndarray, y: np.ndarray):
+        pass
+    def predict(self, x: np.ndarray) -> np.ndarray:
+        pass
+
+    def _predict(self, x: np.ndarray) -> Union[np.ndarray, float, int]:
+        pass
+
+    def _encode_cat_features(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+        cat_feature_idxs = []
+
+        encoded_features = []
+        for idx in cat_feature_idxs:
+            encoded_feature = []
+            option_count = {}
+            total_count = {}
+            for i, x_sample in enumerate(x[:, idx]):
+                ctr = (option_count.get(x_sample, 0) + 0.05) / (total_count.get(x_sample, 0) + 1)
+                encoded_feature.append(ctr)
+
+                if y[i] == 1:
+                    option_count[x_sample] = option_count.get(x_sample, 0) + 1
+                total_count[x_sample] = total_count.get(x_sample, 0) + 1
+
+            encoded_features.append(np.array(encoded_feature))
+
+        x[:, cat_feature_idxs] = encoded_features
+        return x
diff --git a/src/metrics/__pycache__/classification.cpython-310.pyc b/src/metrics/__pycache__/classification.cpython-310.pyc
diff --git a/src/metrics/__pycache__/regression.cpython-310.pyc b/src/metrics/__pycache__/regression.cpython-310.pyc
diff --git a/src/metrics/regression.py b/src/metrics/regression.py
@@ -4,8 +4,8 @@
 def mean_squared_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
     """
     Calculate mean-squared error.
-    :param y_true: Target labels.
-    :param y_pred: Target predictions.
+    :param y_true: Target labels (n_examples, ).
+    :param y_pred: Target predictions (n_examples, ).
     :return: Loss.
     """
     n_examples = len(y_true)
@@ -17,9 +17,24 @@ def mean_squared_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
 def mean_absolute_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
     """
     Calculate mean-absolute error.
-    :param y_true: Target labels.
-    :param y_pred: Target predictions.
+    :param y_true: Target labels (n_examples, ).
+    :param y_pred: Target predictions (n_examples, ).
     :return: Loss.
     """
     error = np.mean(np.abs(y_true - y_pred))
     return error
+
+
+def r2_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
+    """
+    Calculate R-squared.
+    :param y_true: Target labels (n_examples, ).
+    :param y_pred: Target predictions (n_examples, ).
+    :return: R-squared score.
+    """
+
+    tss = np.sum((y_true - np.mean(y_true))**2)
+    rss = np.sum((y_true - y_pred)**2)
+
+    r_squared = 1 - rss / tss
+    return r_squared
diff --git a/tests/metrics/regression/__pycache__/test_r2_score.cpython-310-pytest-7.4.4.pyc b/tests/metrics/regression/__pycache__/test_r2_score.cpython-310-pytest-7.4.4.pyc
diff --git a/tests/metrics/regression/test_r2_score.py b/tests/metrics/regression/test_r2_score.py
@@ -0,0 +1,63 @@
+import numpy as np
+
+from src.metrics import r2_score
+
+
+def test_r2_identical_arrays():
+    y_true = np.array([1, 2, 3, 4, 5])
+    y_pred = np.array([1, 2, 3, 4, 5])
+
+    expected_r2 = 1.0
+    r2 = r2_score(y_true, y_pred)
+
+    assert np.isclose(expected_r2, r2, atol=1e-5, rtol=1e-5)
+
+
+def test_r2_shifted_arrays():
+    y_true = np.array([1, 2, 3, 4, 5])
+    y_pred = np.array([2, 3, 4, 5, 6])
+
+    expected_r2 = 0.5
+    r2 = r2_score(y_true, y_pred)
+
+    assert np.isclose(expected_r2, r2, atol=1e-5, rtol=1e-5)
+
+
+def test_r2_reversed_arrays():
+    y_true = np.array([1, 2, 3, 4, 5])
+    y_pred = np.array([5, 4, 3, 2, 1])
+
+    expected_r2 = -3.0
+    r2 = r2_score(y_true, y_pred)
+
+    assert np.isclose(expected_r2, r2, atol=1e-5, rtol=1e-5)
+
+
+def test_r2_large_numbers():
+    y_true = np.array([10, 20, 30, 40, 50])
+    y_pred = np.array([15, 25, 35, 45, 55])
+
+    expected_r2 = 0.875
+    r2 = r2_score(y_true, y_pred)
+
+    assert np.isclose(expected_r2, r2, atol=1e-5, rtol=1e-5)
+
+
+def test_mse_decimal_numbers():
+    y_true = np.array([0.5, 0.6, 0.7, 0.8, 0.9])
+    y_pred = np.array([0.6, 0.7, 0.8, 0.9, 1.0])
+
+    expected_r2 = 0.5
+    r2 = r2_score(y_true, y_pred)
+
+    assert np.isclose(expected_r2, r2, atol=1e-5, rtol=1e-5)
+
+
+def test_mse_with_outliers():
+    y_true = np.array([10, 20, 30, 40, 135])
+    y_pred = np.array([15, 25, 35, 45, 55])
+
+    expected_r2 = 1 - 6500 / 10180
+    r2 = r2_score(y_true, y_pred)
+
+    assert np.isclose(expected_r2, r2, atol=1e-5, rtol=1e-5)