v0.2.4: add null_policy support, fix py38 pyo3 dependency
azmyrajab committed Apr 5, 2024
1 parent 66ead06 commit 7230d0a
Showing 7 changed files with 143 additions and 47 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock


4 changes: 2 additions & 2 deletions Cargo.toml
@@ -1,14 +1,14 @@
 [package]
 name = "polars_ols"
-version = "0.2.3"
+version = "0.2.4"
 edition = "2021"
 
 [lib]
 name = "polars_ols"
 crate-type= ["cdylib"]
 
 [dependencies]
-pyo3 = { version = "*", features = ["extension-module", "abi3-py310"] }
+pyo3 = { version = "*", features = ["extension-module", "abi3-py38"] } # set > py38 supported version
 pyo3-polars = { version = "*", features = ["derive"] }
 serde = { version = "*", features = ["derive"] }
 polars = { version = "*", features = ["performant", "lazy", "ndarray", "dtype-struct"]}
17 changes: 9 additions & 8 deletions README.md
@@ -34,6 +34,7 @@ Importing `polars_ols` will register the namespace `least_squares` provided by t
 You can build models either by specifying polars expressions (e.g. `pl.col(...)`) for your targets and features or by using
 the formula API (patsy syntax). All models support the following general (optional) arguments:
 - `mode` - a literal which determines the type of output produced by the model
+- `null_policy` - a literal which determines how missing data is handled
 - `add_intercept` - a boolean specifying if an intercept feature should be added to the features
 - `sample_weights` - a column or expression providing non-negative weights applied to the samples
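The diff only shows the `"ignore"` default for `null_policy`, not the full set of accepted values. Purely as an illustration of what a row-dropping policy amounts to before a least-squares fit (polars_ols does this in Rust; the helper below is a hypothetical pure-Python stand-in for a single-feature regression):

```python
def ols_drop_nulls(y, x):
    """Drop (y, x) pairs containing None, then fit y ~ a + b*x by least squares.

    Hypothetical stand-in for a "drop nulls" policy; not polars_ols's API.
    """
    pairs = [(yi, xi) for yi, xi in zip(y, x) if yi is not None and xi is not None]
    n = len(pairs)
    sy = sum(yi for yi, _ in pairs)
    sx = sum(xi for _, xi in pairs)
    sxy = sum(yi * xi for yi, xi in pairs)
    sxx = sum(xi * xi for _, xi in pairs)
    b = (n * sxy - sx * sy) / (n * sxx - sx * sx)  # slope via normal equations
    a = (sy - b * sx) / n                          # intercept
    return a, b

# With the null row dropped, the remaining points lie exactly on y = 1 + x:
a, b = ols_drop_nulls([1.0, None, 3.0, 4.0], [0.0, 1.0, 2.0, 3.0])
```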

@@ -179,17 +180,17 @@ This benchmark was run on randomly generated data with [pyperf](https://github.c
 | Ridge                   | 262 ± 3 us        | 369 ± 231 us        | Numpy          | 1.4x                         |
 | Weighted Least Squares  | 493 ± 7 us        | 2.13 ms ± 0.22 ms   | Statsmodels    | 4.3x                         |
 | Elastic Net             | 326 ± 3 us        | 87.3 ms ± 9.0 ms    | Sklearn        | 268.2x                       |
-| Recursive Least Squares | 1.39 ms ± 0.01 ms | 22.3 ms ± 0.2 ms    | Statsmodels    | 16.0x                        |
+| Recursive Least Squares | 1.39 ms ± 0.01 ms | 18.7 ms ± 1.4 ms    | Statsmodels    | 13.5x                        |
 | Rolling Least Squares   | 2.72 ms ± 0.03 ms | 22.3 ms ± 0.2 ms    | Statsmodels    | 8.2x                         |
 
 ### n_samples=10_000, n_features=100
-| Model                   | polars_ols        | Python Benchmark    | Benchmark Type | Speed-up vs Python Benchmark |
-|-------------------------|-------------------|---------------------|----------------|------------------------------|
-| Least Squares           | 15.6 ms ± 0.2 ms  | 29.9 ms ± 8.6 ms    | Numpy          | 1.9x                         |
-| Ridge                   | 5.81 ms ± 0.05 ms | 5.21 ms ± 0.94 ms   | Numpy          | 0.9x                         |
-| Weighted Least Squares  | 16.8 ms ± 0.2 ms  | 82.4 ms ± 9.1 ms    | Statsmodels    | 4.9x                         |
-| Elastic Net             | 20.9 ms ± 0.3 ms  | 134 ms ± 21 ms      | Sklearn        | 6.4x                         |
-| Recursive Least Squares | 163 ms ± 28 ms    | 3.99 sec ± 0.54 sec | Statsmodels    | 24.5x                        |
+| Model                   | polars_ols        | Python Benchmark    | Benchmark Type | Speed-up vs Python Benchmark |
+|-------------------------|-------------------|---------------------|----------------|------------------------------|
+| Least Squares           | 15.6 ms ± 0.2 ms  | 29.9 ms ± 8.6 ms    | Numpy          | 1.9x                         |
+| Ridge                   | 5.81 ms ± 0.05 ms | 5.21 ms ± 0.94 ms   | Numpy          | 0.9x                         |
+| Weighted Least Squares  | 16.8 ms ± 0.2 ms  | 82.4 ms ± 9.1 ms    | Statsmodels    | 4.9x                         |
+| Elastic Net             | 20.9 ms ± 0.3 ms  | 134 ms ± 21 ms      | Sklearn        | 6.4x                         |
+| Recursive Least Squares | 163 ms ± 28 ms    | 65.7 sec ± 28.2 sec | Statsmodels    | 403.1x                       |
+| Rolling Least Squares   | 390 ms ± 10 ms    | 3.99 sec ± 0.54 sec | Statsmodels    | 10.2x                        |

Numpy's `lstsq` is already a highly optimized call into LAPACK and so the scope for speed-up is limited.
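The `lstsq` baseline the line above refers to is a one-liner. As an illustrative sketch of the kind of call being benchmarked (not the benchmark's actual code; data shapes and names are invented here):

```python
import numpy as np

# Minimal numpy OLS baseline: solve min ||X @ beta - y||_2 via the LAPACK
# driver behind np.linalg.lstsq. Illustrative only, with a noiseless system
# so the true coefficients are recovered exactly (up to float precision).
rng = np.random.default_rng(0)
X = rng.normal(size=(2_000, 5))
beta_true = np.arange(1.0, 6.0)             # [1, 2, 3, 4, 5]
y = X @ beta_true                           # noiseless targets
beta, *_ = np.linalg.lstsq(X, y, rcond=None)
```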
18 changes: 13 additions & 5 deletions polars_ols/__init__.py
@@ -3,7 +3,9 @@
 import polars as pl
 
 from polars_ols.least_squares import (
+    NullPolicy,
     OLSKwargs,
+    OutputMode,
     RLSKwargs,
     RollingKwargs,
     compute_least_squares,
@@ -33,7 +35,8 @@ def least_squares(
     *features: pl.Expr,
     sample_weights: Optional[pl.Expr] = None,
     add_intercept: bool = False,
-    mode: Literal["predictions", "residuals", "coefficients"] = "predictions",
+    mode: OutputMode = "predictions",
+    null_policy: NullPolicy = "ignore",
     **ols_kwargs,
 ) -> pl.Expr:
     return compute_least_squares(
@@ -42,6 +45,7 @@ def least_squares(
         sample_weights=sample_weights,
         add_intercept=add_intercept,
         mode=mode,
+        null_policy=null_policy,
         ols_kwargs=OLSKwargs(**ols_kwargs),
     )

@@ -65,7 +69,8 @@ def rls(
     *features: pl.Expr,
     sample_weights: Optional[pl.Expr] = None,
     add_intercept: bool = False,
-    mode: Literal["predictions", "residuals", "coefficients"] = "predictions",
+    mode: OutputMode = "predictions",
+    null_policy: NullPolicy = "ignore",
     **rls_kwargs,
 ):
     return compute_recursive_least_squares(
@@ -74,6 +79,7 @@ def rls(
         sample_weights=sample_weights,
         add_intercept=add_intercept,
         mode=mode,
+        null_policy=null_policy,
         rls_kwargs=RLSKwargs(**rls_kwargs),
     )

@@ -82,7 +88,8 @@ def rolling_ols(
     *features: pl.Expr,
     sample_weights: Optional[pl.Expr] = None,
     add_intercept: bool = False,
-    mode: Literal["predictions", "residuals", "coefficients"] = "predictions",
+    mode: OutputMode = "predictions",
+    null_policy: NullPolicy = "ignore",
     **rolling_kwargs,
 ):
     return compute_rolling_least_squares(
@@ -91,6 +98,7 @@ def rolling_ols(
         sample_weights=sample_weights,
         add_intercept=add_intercept,
         mode=mode,
+        null_policy=null_policy,
         rolling_kwargs=RollingKwargs(**rolling_kwargs),
     )

@@ -111,12 +119,12 @@ def from_formula(self, formula: str, **kwargs) -> pl.Expr:
     def predict(
         self, *features: pl.Expr, name: Optional[str] = None, add_intercept: bool = False
     ) -> pl.Expr:
-        return predict(self._expr, *features, name=name, add_intercept=add_intercept)
+        return predict(self._expr, *features, add_intercept=add_intercept, name=name)
 
     def predict_from_formula(self, formula: str, name: Optional[str] = None) -> pl.Expr:
         features, add_intercept = build_expressions_from_patsy_formula(
             formula, include_dependent_variable=False
         )
         has_const = any(f.meta.output_name == "const" for f in features)
         add_intercept &= not has_const
-        return self.predict(*features, add_intercept=add_intercept, name=name)
+        return self.predict(*features, name=name, add_intercept=add_intercept)
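The `add_intercept &= not has_const` guard in `predict_from_formula` above prevents adding a second intercept when the patsy formula already contains an explicit `const` term. A hypothetical stand-alone sketch of that check (names invented, not the library's API):

```python
def resolve_intercept(feature_names, add_intercept):
    """Return whether an intercept column still needs to be appended.

    Hypothetical sketch of the double-intercept guard: if the design matrix
    already carries a constant column (e.g. patsy's "const"), skip adding one.
    """
    has_const = any(name == "const" for name in feature_names)
    return add_intercept and not has_const

resolve_intercept(["x1", "x2"], True)           # intercept still needed
resolve_intercept(["const", "x1", "x2"], True)  # already present, skip
```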
