Skip to content

Commit

Permalink
[ENH] distributions: pmf and log_pmf method (#295)
Browse files Browse the repository at this point in the history
This PR adds a `pmf` and `log_pmf` method to the base interface. Fixes
#289

In accordance with #229, these return 0 and `-np.inf`, respectively, if the
distribution is continuous.

Also makes the following, connected changes:
* `pdf` returns 0 for discrete distributions
* removes the discrete/continuous handling logic from the `scipy`
adapter, as this is now in the base class

I've also changed the way in which `TestScipyAdapter` queries the
distributions - by inheritance, not by tag. This is because the tag is
"mechanical" (for internal testing only), and it might confuse users to
see a value in `object_type` which is not related to an external API
property.
  • Loading branch information
fkiraly authored May 4, 2024
1 parent a12fa35 commit facaa92
Show file tree
Hide file tree
Showing 5 changed files with 158 additions and 29 deletions.
22 changes: 1 addition & 21 deletions skpro/distributions/adapters/scipy/_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class _ScipyAdapter(BaseDistribution):

_distribution_attr = "_dist"
_tags = {
"object_type": ["distribution", "scipy_distribution_adapter"],
"object_type": "distribution",
}

def __init__(self, index=None, columns=None):
Expand Down Expand Up @@ -58,17 +58,11 @@ def _var(self):

def _pdf(self, x: pd.DataFrame):
obj: Union[rv_continuous, rv_discrete] = getattr(self, self._distribution_attr)
if isinstance(obj, rv_discrete):
return 0

args, kwds = self._get_scipy_param()
return obj.pdf(x, *args, **kwds)

def _log_pdf(self, x: pd.DataFrame):
obj: Union[rv_continuous, rv_discrete] = getattr(self, self._distribution_attr)
if isinstance(obj, rv_discrete):
return 0

args, kwds = self._get_scipy_param()
return obj.logpdf(x, *args, **kwds)

Expand All @@ -85,25 +79,11 @@ def _ppf(self, p: pd.DataFrame):
def _pmf(self, x: pd.DataFrame):
"""Return the probability mass function evaluated at x."""
obj: Union[rv_continuous, rv_discrete] = getattr(self, self._distribution_attr)
if isinstance(obj, rv_continuous):
return 0

args, kwds = self._get_scipy_param()
return obj.pmf(x, *args, **kwds)

def pmf(self, x: pd.DataFrame):
"""Return the probability mass function evaluated at x."""
return self._boilerplate("_pmf", x=x)

def _log_pmf(self, x: pd.DataFrame):
"""Return the log of the probability mass function evaluated at x."""
obj: Union[rv_continuous, rv_discrete] = getattr(self, self._distribution_attr)
if isinstance(obj, rv_continuous):
return 0

args, kwds = self._get_scipy_param()
return obj.logpmf(x, *args, **kwds)

def log_pmf(self, x: pd.DataFrame):
"""Return the log of the probability mass function evaluated at x."""
return self._boilerplate("_log_pmf", x=x)
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest
from skbase.testing import QuickTester

from skpro.distributions.adapters.scipy import _ScipyAdapter
from skpro.tests.test_all_estimators import BaseFixtureGenerator, PackageConfig

__author__ = ["fkiraly", "malikrafsan"]
Expand Down Expand Up @@ -60,7 +61,7 @@ class ScipyDistributionFixtureGenerator(BaseFixtureGenerator):
instances are generated by create_test_instance class method
"""

object_type_filter = "scipy_distribution_adapter"
object_type_filter = _ScipyAdapter


class TestScipyAdapter(PackageConfig, ScipyDistributionFixtureGenerator, QuickTester):
Expand Down Expand Up @@ -104,7 +105,7 @@ def test_method_continuous_dist(self, object_instance, method, scipy_method, x):
"""Test continuous distribution method."""
res = getattr(object_instance, method)(x)
if object_instance._tags["distr:measuretype"] != "continuous":
scipy_res = 0
return None # in this case, scipy method is not defined
else:
params = object_instance._get_scipy_param()
scipy_obj = object_instance._get_scipy_object()
Expand All @@ -118,7 +119,7 @@ def test_method_discrete_dist(self, object_instance, method, scipy_method, x):
"""Test discrete distribution method."""
res = getattr(object_instance, method)(x)
if object_instance._tags["distr:measuretype"] != "discrete":
scipy_res = 0
return None # in this case, scipy method is not defined
else:
params = object_instance._get_scipy_param()
scipy_obj = object_instance._get_scipy_object()
Expand Down
120 changes: 120 additions & 0 deletions skpro/distributions/base/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,10 @@ def pdf(self, x):
``pd.DataFrame`` with same columns and index as ``self``
containing :math:`p_{X_{ij}}(x_{ij})`, as above
"""
distr_type = self.get_tag("distr:measuretype", "mixed", raise_error=False)
if distr_type == "discrete":
return self._coerce_to_self_index_df(0, flatten=False)

return self._boilerplate("_pdf", x=x)

def _pdf(self, x):
Expand Down Expand Up @@ -582,6 +586,10 @@ def log_pdf(self, x):
``pd.DataFrame`` with same columns and index as ``self``
containing :math:`\log p_{X_{ij}}(x_{ij})`, as above
"""
distr_type = self.get_tag("distr:measuretype", "mixed", raise_error=False)
if distr_type == "discrete":
return self._coerce_to_self_index_df(-np.inf, flatten=False)

return self._boilerplate("_log_pdf", x=x)

def _log_pdf(self, x):
Expand All @@ -604,6 +612,118 @@ def _log_pdf(self, x):

raise NotImplementedError(self._method_error_msg("log_pdf", "error"))

def pmf(self, x):
    r"""Probability mass function.

    Let :math:`X` be a random variable with the distribution of ``self``,
    taking values in ``(N, n)`` ``DataFrame``-s.
    Let :math:`x\in \mathbb{R}^{N\times n}`.
    By :math:`m_{X_{ij}}`, denote the marginal mass of :math:`X` at the
    :math:`(i,j)`-th entry, i.e.,
    :math:`m_{X_{ij}}(x_{ij}) = \mathbb{P}(X_{ij} = x_{ij})`.

    The output of this method, for input ``x`` representing :math:`x`,
    is a ``DataFrame`` with same columns and indices as ``self``,
    and entries :math:`m_{X_{ij}}(x_{ij})`.

    If the ``"distr:measuretype"`` tag of ``self`` is ``"continuous"``,
    all entries of the result are 0 and ``_pmf`` is not called.
    In every other case (including a missing tag, which is treated
    as ``"mixed"``), the computation is dispatched to ``_pmf``.

    Parameters
    ----------
    x : ``pandas.DataFrame`` or 2D ``np.ndarray``
        representing :math:`x`, as above

    Returns
    -------
    ``pd.DataFrame`` with same columns and index as ``self``
        containing :math:`m_{X_{ij}}(x_{ij})`, as above
    """
    distr_type = self.get_tag("distr:measuretype", "mixed", raise_error=False)
    if distr_type == "continuous":
        # single points carry no probability mass under a continuous law
        return self._coerce_to_self_index_df(0, flatten=False)

    # NOTE(review): for "mixed" distributions, _pmf presumably reports the
    # mass of the discrete part only - confirm against implementations
    return self._boilerplate("_pmf", x=x)

def _pmf(self, x):
    """Probability mass function, private default implementation.

    Private method, to be implemented by subclasses.

    If the subclass implements ``log_pmf`` or ``_log_pmf``, the mass is
    obtained by exponentiating ``log_pmf``, after emitting a warning that
    this fallback may be numerically unstable.

    Raises
    ------
    NotImplementedError
        if neither ``log_pmf`` nor ``_log_pmf`` is implemented
    """
    can_derive_from_log = self._has_implementation_of(
        "log_pmf"
    ) or self._has_implementation_of("_log_pmf")

    # guard clause: without a log-mass implementation there is no fallback
    if not can_derive_from_log:
        raise NotImplementedError(self._method_error_msg("pmf", "error"))

    fallback_note = (
        "by exponentiating the output returned by the log_pmf method, "
        "this may be numerically unstable"
    )
    warn(self._method_error_msg("pmf", fill_in=fallback_note))

    log_mass = self.log_pmf(x=self._coerce_to_self_index_df(x, flatten=False))
    if isinstance(log_mass, pd.DataFrame):
        # return raw values, not the frame
        log_mass = log_mass.values
    return np.exp(log_mass)

def log_pmf(self, x):
    r"""Logarithmic probability mass function.

    Numerically more stable than calling pmf and then taking logarithms.

    Let :math:`X` be a random variable with the distribution of ``self``,
    taking values in ``(N, n)`` ``DataFrame``-s.
    Let :math:`x\in \mathbb{R}^{N\times n}`.
    By :math:`m_{X_{ij}}`, denote the marginal mass of :math:`X` at the
    :math:`(i,j)`-th entry, i.e.,
    :math:`m_{X_{ij}}(x_{ij}) = \mathbb{P}(X_{ij} = x_{ij})`.

    The output of this method, for input ``x`` representing :math:`x`,
    is a ``DataFrame`` with same columns and indices as ``self``,
    and entries :math:`\log m_{X_{ij}}(x_{ij})`.

    If the ``"distr:measuretype"`` tag of ``self`` is ``"continuous"``,
    all entries of the result are ``-np.inf`` (the logarithm of zero mass)
    and ``_log_pmf`` is not called.
    In every other case (including a missing tag, which is treated
    as ``"mixed"``), the computation is dispatched to ``_log_pmf``.

    Parameters
    ----------
    x : ``pandas.DataFrame`` or 2D ``np.ndarray``
        representing :math:`x`, as above

    Returns
    -------
    ``pd.DataFrame`` with same columns and index as ``self``
        containing :math:`\log m_{X_{ij}}(x_{ij})`, as above
    """
    distr_type = self.get_tag("distr:measuretype", "mixed", raise_error=False)
    if distr_type == "continuous":
        # zero mass everywhere, hence log-mass is minus infinity
        return self._coerce_to_self_index_df(-np.inf, flatten=False)

    return self._boilerplate("_log_pmf", x=x)

def _log_pmf(self, x):
    """Logarithmic probability mass function, private default implementation.

    Private method, to be implemented by subclasses.

    If the subclass implements ``pmf`` or ``_pmf``, the log-mass is
    obtained by taking the logarithm of ``pmf``, after emitting a warning
    that this fallback may be numerically unstable.
    Entries with zero mass map to ``-np.inf``.

    Raises
    ------
    NotImplementedError
        if neither ``pmf`` nor ``_pmf`` is implemented
    """
    if self._has_implementation_of("pmf") or self._has_implementation_of("_pmf"):
        approx_method = (
            "by taking the logarithm of the output returned by the pmf method, "
            "this may be numerically unstable"
        )
        warn(self._method_error_msg("log_pmf", fill_in=approx_method))

        x = self._coerce_to_self_index_df(x, flatten=False)
        res = self.pmf(x=x)
        if isinstance(res, pd.DataFrame):
            res = res.values
        # zero mass is a regular outcome (points outside the support);
        # log(0) = -inf is the intended value, so silence numpy's
        # divide-by-zero RuntimeWarning for this call only
        with np.errstate(divide="ignore"):
            return np.log(res)

    raise NotImplementedError(self._method_error_msg("log_pmf", "error"))

def cdf(self, x):
r"""Cumulative distribution function.
Expand Down
16 changes: 14 additions & 2 deletions skpro/distributions/tests/test_all_distrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ def _has_capability(distr, method):

METHODS_SCALAR = ["mean", "var", "energy"]
METHODS_SCALAR_POS = ["var", "energy"] # result always non-negative?
METHODS_X = ["energy", "pdf", "log_pdf", "cdf"]
METHODS_X_POS = ["energy", "pdf", "cdf"] # result always non-negative?
METHODS_X = ["energy", "pdf", "log_pdf", "pmf", "log_pmf", "cdf"]
METHODS_X_POS = ["energy", "pdf", "pmf", "cdf"] # result always non-negative?
METHODS_P = ["ppf"]
METHODS_ROWWISE = ["energy"] # results in one column

Expand Down Expand Up @@ -248,6 +248,18 @@ def test_log_pdf_and_pdf(self, object_instance):
log_pdf = d.log_pdf(x)
assert np.allclose(np.log(pdf), log_pdf)

def test_log_pmf_and_pmf(self, object_instance):
    """Check that ``log_pmf`` agrees with the logarithm of ``pmf``.

    The comparison runs only if both ``pmf`` and ``log_pmf`` are listed in
    the ``capabilities:exact`` tag; otherwise the test returns without
    asserting anything.
    """
    distr = object_instance
    exact_methods = distr.get_tags()["capabilities:exact"]

    if not ("log_pmf" in exact_methods and "pmf" in exact_methods):
        return

    # evaluate both functions at a draw from the distribution itself
    point = distr.sample()
    mass = distr.pmf(point)
    log_mass = distr.log_pmf(point)
    assert np.allclose(np.log(mass), log_mass)

def test_ppf_and_cdf(self, object_instance):
"""Test that the ppf is the inverse of the cdf."""
d = object_instance
Expand Down
22 changes: 19 additions & 3 deletions skpro/tests/test_all_estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import numbers
import types
from copy import deepcopy
from inspect import getfullargspec, signature
from inspect import getfullargspec, isclass, signature

import joblib
import numpy as np
Expand Down Expand Up @@ -82,13 +82,29 @@ class BaseFixtureGenerator(_BaseFixtureGenerator):

# overrides object retrieval in scikit-base
def _all_objects(self):
"""Retrieve list of all object classes of type self.object_type_filter."""
"""Retrieve list of all object classes of type self.object_type_filter.
If self.object_type_filter is None, retrieve all objects.
If class, retrieve all classes inheriting from self.object_type_filter.
Otherwise (assumed str or list of str), retrieve all classes with tags
object_type in self.object_type_filter.
"""
filter = getattr(self, "object_type_filter", None)

if isclass(filter):
object_types = filter.get_class_tag("object_type", None)
else:
object_types = filter

obj_list = all_objects(
object_types=getattr(self, "object_type_filter", None),
object_types=object_types,
return_names=False,
exclude_objects=self.exclude_objects,
)

if isclass(filter):
obj_list = [obj for obj in obj_list if issubclass(obj, filter)]

# run_test_for_class selects the estimators to run
# based on whether they have changed, and whether they have all dependencies
# internally, uses the ONLY_CHANGED_MODULES flag,
Expand Down

0 comments on commit facaa92

Please sign in to comment.