diff --git a/src/_quarto.yml b/src/_quarto.yml index a202fded4..f6aa243df 100644 --- a/src/_quarto.yml +++ b/src/_quarto.yml @@ -188,6 +188,7 @@ website: - reference-manual/pathfinder.qmd - reference-manual/variational.qmd - reference-manual/laplace.qmd + - reference-manual/laplace_embedded.qmd - reference-manual/diagnostics.qmd - section: "Usage" contents: @@ -237,6 +238,7 @@ website: - section: "Additional Distributions" contents: - functions-reference/hidden_markov_models.qmd + - functions-reference/embedded_laplace.qmd - section: "Appendix" contents: - functions-reference/mathematical_functions.qmd diff --git a/src/bibtex/all.bib b/src/bibtex/all.bib index 9f8fabd3e..d59eca688 100644 --- a/src/bibtex/all.bib +++ b/src/bibtex/all.bib @@ -274,7 +274,7 @@ @article{turing:1936 volume={58}, number={345-363}, pages={5}, year={1936} } - + @article{kucukelbir:2015, title={Automatic variational inference in Stan}, author={Kucukelbir, Alp and Ranganath, Rajesh and Gelman, Andrew and Blei, David}, journal={Advances in @@ -1273,7 +1273,7 @@ @article{Timonen+etal:2023:ODE-PSIS title={An importance sampling approach for reliable and efficient inference in {Bayesian} ordinary differential equation models}, author={Timonen, Juho and Siccha, Nikolas and Bales, Ben and L{\"a}hdesm{\"a}ki, Harri and Vehtari, - Aki}, journal={Stat}, year={2023}, volume = 12, number = 1, pages = {e614} + Aki}, journal={Stat}, year={2023}, volume = 12, number = 1, pages = {e614} } @article{Vehtari+etal:2024:PSIS, @@ -1320,5 +1320,71 @@ @misc{seyboldt:2024 @article{lancaster:1965, ISSN = {00029890, 19300972}, URL = {http://www.jstor.org/stable/2312989}, author = {H. O. Lancaster}, journal = {The American Mathematical Monthly}, number = {1}, pages = {4--12}, publisher = -{Mathematical Association of America}, title = {The {H}elmert Matrices}, urldate -= {2025-05-07}, volume = {72}, year = {1965} } +{Mathematical Association of America}, title = {The {H}elmert Matrices}, +urldate = {2025-05-07}, volume = {72}, year = {1965} } + +@article{Margossian:2020, + Author = {Margossian, Charles C and Vehtari, Aki and Simpson, Daniel + and Agrawal, Raj}, + Title = {Hamiltonian Monte Carlo using an adjoint-differentiated Laplace approximation: Bayesian inference for latent Gaussian models and beyond}, + journal = {Advances in Neural Information Processing Systems}, + volume = {34}, + Year = {2020}} + +@article{Kuss:2005, + author = {Kuss, Malte and Rasmussen, Carl E}, + title = {Assessing Approximate Inference for Binary {Gaussian} Process Classification}, + journal = {Journal of Machine Learning Research}, + volume = {6}, + pages = {1679 -- 1704}, + year = {2005}} + +@article{Vanhatalo:2010, + author = {Jarno Vanhatalo and Ville Pietil\"{a}inen and Aki Vehtari}, + title = {Approximate inference for disease mapping with sparse {Gaussian} processes}, + journal = {Statistics in Medicine}, + year = {2010}, + volume = {29}, + number = {15}, + pages = {1580--1607} +} + +@article{Cseke:2011, + author = {Botond Cseke and Heskes, Tom}, + title = {Approximate marginals in latent {Gaussian} models}, + journal = {Journal of Machine Learning Research}, + volume = {12}, + issue = {2}, + page = {417 -- 454}, + year = {2011}} + +@article{Vehtari:2016, + author = {Aki Vehtari and Tommi Mononen and Ville Tolvanen and Tuomas Sivula and Ole Winther}, + title = {Bayesian Leave-One-Out Cross-Validation Approximations for {Gaussian} Latent Variable Models}, + journal = {Journal of Machine Learning Research}, + year = {2016}, + volume = {17}, + number = {103}, 
+ pages = {1--38}, + url = {http://jmlr.org/papers/v17/14-540.html} +} + + +@article{Margossian:2023, + author = {Margossian, Charles C}, + title = {General adjoint-differentiated Laplace approximation}, + journal = {arXiv:2306.14976 }, + year = {2023}} + + + @article{Rue:2009, + title={Approximate Bayesian inference for latent Gaussian models by using integrated nested Laplace approximations}, + author={Rue, H{\aa}vard and Martino, Sara and Chopin, Nicolas}, + journal={Journal of the Royal Statistical Society: Series B (Statistical Methodology)}, + volume={71}, + number={2}, + pages={319--392}, + year={2009}, + publisher={Wiley Online Library}, + doi={10.1111/j.1467-9868.2008.00700.x} +} diff --git a/src/functions-reference/_quarto.yml b/src/functions-reference/_quarto.yml index bc6e83acb..9fd6f9a9a 100644 --- a/src/functions-reference/_quarto.yml +++ b/src/functions-reference/_quarto.yml @@ -72,6 +72,7 @@ book: - part: "Additional Distributions" chapters: - hidden_markov_models.qmd + - embedded_laplace.qmd - part: "Appendix" chapters: - mathematical_functions.qmd diff --git a/src/functions-reference/embedded_laplace.qmd b/src/functions-reference/embedded_laplace.qmd new file mode 100644 index 000000000..63346dbfb --- /dev/null +++ b/src/functions-reference/embedded_laplace.qmd @@ -0,0 +1,616 @@ +--- +pagetitle: Embedded Laplace Approximation +--- + +# Embedded Laplace Approximation + +The embedded Laplace approximation can be used to approximate certain +marginal and conditional distributions that arise in latent Gaussian models. +A latent Gaussian model observes the following hierarchical structure: +\begin{eqnarray} + \phi &\sim& p(\phi), \\ + \theta &\sim& \text{MultiNormal}(0, K(\phi)), \\ + y &\sim& p(y \mid \theta, \phi). +\end{eqnarray} +In this formulation, $y$ represents the +observed data, and $p(y \mid \theta, \phi)$ is the likelihood function that +specifies how observations are generated conditional on the latent Gaussian +variables $\theta$ and hyperparameters $\phi$. +$K(\phi)$ denotes the prior covariance matrix for the latent Gaussian variables +$\theta$ and is parameterized by $\phi$. +The prior $p(\theta \mid \phi)$ is restricted to be a multivariate normal +centered at 0. That said, we can always pick a likelihood that offsets $\theta$, +which is equivalently to specifying a prior mean. + +To sample from the joint posterior $p(\phi, \theta \mid y)$, we can either +use a standard method, such as Markov chain Monte Carlo, or we can follow +a two-step procedure: + +1. sample from the *marginal posterior* $p(\phi \mid y)$, +2. sample from the *conditional posterior* $p(\theta \mid y, \phi)$. + +In the above procedure, neither the marginal posterior nor the conditional posterior +are typically available in closed form and so they must be approximated. +The marginal posterior can be written as $p(\phi \mid y) \propto p(y \mid \phi) p(\phi)$, +where $p(y \mid \phi) = \int p(y \mid \phi, \theta) p(\theta) \text{d}\theta$ +is called the marginal likelihood. The Laplace method approximates +$p(y \mid \phi, \theta) p(\theta)$ with a normal distribution centered at +$$ + \theta^* = \underset{\theta}{\text{argmax}} \ \log p(\theta \mid y, \phi), +$$ +and $\theta^*$ is obtained using a numerical optimizer. +The resulting Gaussian integral can be evaluated analytically to obtain an +approximation to the log marginal likelihood +$\log \hat p(y \mid \phi) \approx \log p(y \mid \phi)$. 
+Specifically:
+$$
+  \hat p(y \mid \phi) = \frac{p(\theta^* \mid \phi) p(y \mid \theta^*, \phi)}{\hat p (\theta^* \mid \phi, y)}.
+$$
+
+Combining this marginal likelihood with the prior in the `model`
+block, we can then sample from the marginal posterior $p(\phi \mid y)$
+using one of Stan's algorithms. The marginal posterior is lower-dimensional
+and likely to have a simpler geometry, leading to more
+efficient inference. On the other hand, each marginal likelihood
+computation is more costly, and the combined change in efficiency
+depends on the case.
+
+To obtain posterior draws for $\theta$, we sample from the normal
+approximation to $p(\theta \mid y, \phi)$ in `generated quantities`.
+The process of iteratively sampling from $p(\phi \mid y)$ (say, with MCMC) and
+then $p(\theta \mid y, \phi)$ produces samples from the joint posterior
+$p(\theta, \phi \mid y)$.
+
+The Laplace approximation is especially useful if $p(y \mid \phi, \theta)$ is
+log-concave. Stan's embedded Laplace approximation is restricted to the case
+where the prior $p(\theta \mid \phi)$ is multivariate normal.
+Furthermore, the likelihood $p(y \mid \phi, \theta)$ must be computed using
+only operations which support higher-order derivatives
+(see section [specifying the likelihood function](#laplace-likelihood_spec)).
+
+## Approximating the log marginal likelihood $\log p(y \mid \phi)$
+
+In the `model` block, we increment `target` with `laplace_marginal`, a function
+that approximates the log marginal likelihood $\log p(y \mid \phi)$.
+The signature of the function is:
+
+\index{{\tt \bfseries laplace\_marginal }!{\tt (function likelihood\_function, tuple(...) likelihood\_arguments, function covariance\_function, tuple(...) covariance\_arguments): real}|hyperpage}
+
+
+
+`real` **`laplace_marginal`**`(function likelihood_function, tuple(...) likelihood_arguments, function covariance_function, tuple(...) covariance_arguments)`
+
+Returns an approximation to the log marginal likelihood $p(y \mid \phi)$.
+{{< since 2.37 >}}
+
+This function takes in the following arguments.
+
+1. `likelihood_function` - the user-specified log likelihood, whose first argument is the vector of latent Gaussian variables `theta`
+2. `likelihood_arguments` - a tuple of the arguments whose internal members will be passed to the likelihood function
+3. `covariance_function` - the prior covariance function
+4. `covariance_arguments` - a tuple of the arguments whose internal members will be passed to the covariance function
+
+Below we go over each argument in more detail.
+
+## Specifying the log likelihood function {#laplace-likelihood_spec}
+
+The first step to use the embedded Laplace approximation is to write down a
+function in the `functions` block which returns the log joint likelihood
+$\log p(y \mid \theta, \phi)$.
+
+There are a few constraints on this function:
+
+1. The function return type must be `real`.
+
+2. The first argument must be the latent Gaussian variable $\theta$ and must
+have type `vector`.
+
+3. The operations in the function must support higher-order automatic
+differentiation (AD). Most functions in Stan support higher-order AD.
+The exceptions are functions with specialized calls for reverse-mode AD:
+the higher-order functions (algebraic solvers, differential equation
+solvers, and integrators), the marginalization function for hidden Markov
+models (HMM), and the embedded Laplace approximation itself.
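+
+For instance, a log likelihood for conditionally independent Poisson
+observations, each attached to one group-level latent variable, could be
+written as in the following sketch (the function and argument names here are
+illustrative, not part of the interface):
+
+```stan
+functions {
+  // log p(y | theta, phi) for grouped Poisson counts;
+  // the latent vector theta must come first
+  real grouped_poisson_ll(vector theta, data array[] int y,
+                          data array[] int g, data vector m) {
+    return poisson_log_lpmf(y | theta[g] + m[g]);
+  }
+}
+```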
+
+The base signature of the function is
+
+```stan
+real likelihood_function(vector theta, ...)
+```
+
+The `...` represents a set of optional variadic arguments. There are no type
+restrictions for the variadic arguments `...` and each argument can be passed
+as data or parameter.
+
+The tuple after `likelihood_function` contains the arguments that get passed
+to `likelihood_function`, *excluding $\theta$*. For instance, if a user-defined
+likelihood uses a real and a matrix, the likelihood function's signature would
+first have a vector argument, followed by a real and a matrix argument.
+
+```stan
+real likelihood_fun(vector theta, real a, matrix X)
+```
+
+The call to the Laplace marginal would start with this likelihood and a
+tuple holding the other likelihood arguments. We do not need to pass
+`theta`, since it is marginalized out and therefore does not
+appear explicitly as a model parameter.
+
+```stan
+real val = laplace_marginal(likelihood_fun, (a, X), ...);
+```
+
+As always, users should use parameter arguments only when necessary in order to
+speed up differentiation.
+In general, we recommend marking data-only arguments with the keyword `data`,
+for example,
+
+```stan
+real likelihood_function(vector theta, data vector x, ...)
+```
+
+## Specifying the covariance function
+
+The argument `covariance_function` returns the prior covariance matrix
+$K$. The signature for this function is the same as for a standard Stan function.
+Its return type must be a matrix of size $n \times n$, where $n$ is the size of $\theta$.
+
+```stan
+matrix covariance_function(...)
+```
+
+The `...` represents a set of optional
+variadic arguments. There are no type restrictions for the variadic arguments
+`...` and each argument can be passed as data or parameter. The variable
+$\phi$ is implicitly defined as the collection of all non-data arguments passed
+to `likelihood_function` (excluding $\theta$) and `covariance_function`.
+
+The tuple after `covariance_function` contains the arguments that get passed
+to `covariance_function`. For instance, if a user-defined covariance function
+uses a real and a matrix
+```stan
+matrix cov_fun(real b, matrix Z)
+```
+the call to the Laplace marginal would include the covariance function and
+a tuple holding the covariance function arguments.
+
+```stan
+real val = laplace_marginal(likelihood_fun, (a, X), cov_fun, (b, Z), ...);
+```
+
+## Control parameters
+
+It is also possible to specify control parameters, which can help improve the
+optimization that underlies the Laplace approximation, using `laplace_marginal_tol`
+with the following signature:
+
+\index{{\tt \bfseries laplace\_marginal\_tol }!{\tt (function likelihood\_function, tuple(...), function covariance\_function, tuple(...), vector theta\_init, real tol, int max\_steps, int hessian\_block\_size, int solver, int max\_steps\_linesearch): real}|hyperpage}
+
+`real` **`laplace_marginal_tol`**`(function likelihood_function, tuple(...), function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch)`
\newline
+
+Returns an approximation to the log marginal likelihood $p(y \mid \phi)$
+and allows the user to tune the control parameters of the approximation.
+
+* `theta_init`: the initial guess for the Newton solver when finding the mode
+of $p(\theta \mid y, \phi)$. By default, it is a zero vector.
+
+* `tol`: the tolerance $\epsilon$ of the optimizer. Specifically, the optimizer
+stops when $||\nabla \log p(\theta \mid y, \phi)|| \le \epsilon$. By default,
+the value is $\epsilon = 10^{-6}$.
+
+* `max_steps`: the maximum number of steps taken by the optimizer before
+it gives up (in which case the Metropolis proposal gets rejected). The default
+is 100 steps.
+
+* `hessian_block_size`: the size of the blocks, assuming the Hessian
+$\partial^2 \log p(y \mid \theta, \phi) / \partial \theta^2$ is block-diagonal.
+The structure of the Hessian is determined by the dependence structure of $y$
+on $\theta$. By default, the Hessian is treated as diagonal
+(`hessian_block_size=1`). If the Hessian is not block diagonal, then set
+`hessian_block_size=n`, where `n` is the size of $\theta$.
+
+* `solver`: choice of Newton solver. The optimizer used to compute the
+Laplace approximation does one of three matrix decompositions to compute a
+Newton step. The problem determines which decomposition is numerically stable.
+By default (`solver=1`), the solver makes a Cholesky decomposition of the
+negative Hessian, $- \partial^2 \log p(y \mid \theta, \phi) / \partial \theta^2$.
+If `solver=2`, the solver makes a Cholesky decomposition of the covariance
+matrix $K(\phi)$.
+If the Cholesky decomposition cannot be computed for either the negative
+Hessian or the covariance matrix, use `solver=3`, which uses a more expensive
+but less specialized approach.
+
+* `max_steps_linesearch`: maximum number of steps in the linesearch. The linesearch
+method tries to ensure that each Newton step improves the objective function.
+If the Newton step does not improve the objective function,
+the step is repeatedly halved until the objective function improves or the
+maximum number of steps in the linesearch is reached. By default,
+`max_steps_linesearch=0`, meaning no linesearch is performed.
+
+{{< since 2.37 >}}
+
+## Sample from the approximate conditional $\hat{p}(\theta \mid y, \phi)$
+
+In `generated quantities`, it is possible to sample from the Laplace
+approximation of $p(\theta \mid \phi, y)$ using `laplace_latent_rng`.
+The signature for `laplace_latent_rng` follows closely
+the signature for `laplace_marginal`:
+
+
+\index{{\tt \bfseries laplace\_latent\_rng }!{\tt (function likelihood\_function, tuple(...), function covariance\_function, tuple(...), vector theta\_init): vector}|hyperpage}
+
+`vector` **`laplace_latent_rng`**`(function likelihood_function, tuple(...), function covariance_function, tuple(...))`
\newline + +Draws approximate samples from the conditional posterior $p(\theta \mid y, \phi)$. +{{< since 2.37 >}} + +Once again, it is possible to specify control parameters: +\index{{\tt \bfseries laplace\_latent\_tol\_rng }!{\tt (function likelihood\_function, tuple(...), function covariance\_function, tuple(...), vector theta\_init, real tol, int max\_steps, int hessian\_block\_size, int solver, int max\_steps\_linesearch): vector}|hyperpage} + +`vector` **`laplace_latent_tol_rng`**`(function likelihood_function, tuple(...), function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch)`
\newline
+Draws approximate samples from the conditional posterior $p(\theta \mid y, \phi)$
+and allows the user to tune the control parameters of the approximation.
+{{< since 2.37 >}}
+
+## Built-in Laplace marginal likelihood functions
+
+Stan provides convenient wrappers for the embedded Laplace approximation
+when applied to latent Gaussian models with certain likelihoods.
+With these wrappers, the likelihood is pre-specified and does not need to be
+written by the user.
+The selection of supported likelihoods is currently
+narrow and expected to grow. The wrappers exist for the user's
+convenience but are not more computationally efficient than specifying log
+likelihoods in the `functions` block.
+
+### Poisson with log link
+
+Given count data, with each observed count $y_i$ associated with a group
+$g(i)$ and a corresponding latent variable $\theta_{g(i)}$, and a Poisson model,
+the likelihood is
+$$
+p(y \mid \theta, \phi) = \prod_i \text{Poisson} (y_i \mid \exp(\theta_{g(i)} + m_{g(i)})),
+$$
+where $m_{g(i)}$ acts as an offset for $\theta_{g(i)}$. This can also be
+interpreted as a prior mean on $\theta_{g(i)}$.
+The arguments required to compute this likelihood are:
+
+* `y`: an array of counts.
+* `y_index`: an array whose $i^\text{th}$ element indicates which
+group the $i^\text{th}$ observation belongs to.
+* `m`: a vector of offsets or prior means for $\theta$.
+
+
+\index{{\tt \bfseries laplace\_marginal\_poisson\_log }!sampling statement|hyperpage}
+
+`y ~ ` **`laplace_marginal_poisson_log`**`(y_index, m, covariance_function, (...))`
\newline + +Increment target log probability density with `laplace_marginal_poisson_log_lupmf(y | y_index, m, covariance_function, (...))`. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_marginal\_tol\_poisson\_log }!sampling statement|hyperpage} + +`y ~ ` **`laplace_marginal_tol_poisson_log`**`(y_index, m, covariance_function, (...), theta_init, tol, max_steps, hessian_block_size, solver, max_steps_linesearch)`
\newline
+
+Increment target log probability density with `laplace_marginal_tol_poisson_log_lupmf(y | y_index, m, covariance_function, (...), theta_init, tol, max_steps, hessian_block_size, solver, max_steps_linesearch)`.
+{{< since 2.37 >}}
+
+The signatures for the embedded Laplace approximation functions with a Poisson
+likelihood are
+
+
+\index{{\tt \bfseries laplace\_marginal\_poisson\_log\_lpmf }!{\tt (array[] int y \textbar\ array[] int y\_index, vector m, function covariance\_function, tuple(...)): real}|hyperpage}
+`real` **`laplace_marginal_poisson_log_lpmf`**`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...))`
\newline + +Returns an approximation to the log marginal likelihood $p(y \mid \phi)$ +in the special case where the likelihood $p(y \mid \theta)$ is a Poisson +distribution with a log link. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_marginal\_tol\_poisson\_log\_lpmf }!{\tt (array[] int y \textbar\ array[] int y\_index, function covariance\_function, tuple(...), vector theta\_init, real tol, int max\_steps, int hessian\_block\_size, int solver, int max\_steps\_linesearch): real}|hyperpage} +`real` **`laplace_marginal_tol_poisson_log_lpmf`**`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch)`
\newline
+
+Returns an approximation to the log marginal likelihood $p(y \mid \phi)$
+in the special case where the likelihood $p(y \mid \theta)$ is a Poisson
+distribution with a log link, and allows the user to tune the control
+parameters of the approximation.
+{{< since 2.37 >}}
+
+
+
+\index{{\tt \bfseries laplace\_marginal\_poisson\_log\_lupmf }!{\tt (array[] int y \textbar\ array[] int y\_index, vector m, function covariance\_function, tuple(...)): real}|hyperpage}
+`real` **`laplace_marginal_poisson_log_lupmf`**`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...))`
\newline + +Returns an approximation to the log marginal likelihood $p(y \mid \phi)$ +in the special case where the likelihood $p(y \mid \theta)$ is a Poisson +distribution with a log link. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_marginal\_tol\_poisson\_log\_lupmf }!{\tt (array[] int y \textbar\ array[] int y\_index, vector m, function covariance\_function, tuple(...), vector theta\_init, real tol, int max\_steps, int hessian\_block\_size, int solver, int max\_steps\_linesearch): real}|hyperpage} + +`real` **`laplace_marginal_tol_poisson_log_lupmf`**`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch)`
\newline + +Returns an approximation to the log marginal likelihood $p(y \mid \phi)$ +in the special case where the likelihood $p(y \mid \theta)$ is a Poisson +distribution with a log link, and allows the user to tune the control +parameters of the approximation. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_latent\_poisson\_log\_rng }!{\tt (array[] int y, array[] int y\_index, function covariance\_function, tuple(...), vector theta\_init): vector}|hyperpage} +`vector` **`laplace_latent_poisson_log_rng`**`(array[] int y, array[] int y_index, vector m, function covariance_function, tuple(...))`
\newline +Returns a draw from the Laplace approximation to the conditional posterior +$p(\theta \mid y, \phi)$ in the special case where the likelihood +$p(y \mid \theta)$ is a Poisson distribution with a log link. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_latent\_tol\_poisson\_log\_rng }!{\tt (array[] int y, array[] int y\_index, function covariance\_function, tuple(...), vector theta\_init, real tol, int max\_steps, int hessian\_block\_size, int solver, int max\_steps\_linesearch): vector}|hyperpage} + +`vector` **`laplace_latent_tol_poisson_log_rng`**`(array[] int y, array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch)`
\newline
+
+Returns a draw from the Laplace approximation to the conditional posterior
+$p(\theta \mid y, \phi)$ in the special case where the likelihood
+$p(y \mid \theta)$ is a Poisson distribution with a log link
+and allows the user to tune the control parameters of the approximation.
+{{< since 2.37 >}}
+
+
+### Negative Binomial with log link
+
+The negative binomial distribution generalizes the Poisson distribution by
+introducing the dispersion parameter $\eta$. The corresponding likelihood is then
+$$
+p(y \mid \theta, \phi) = \prod_i \text{NegBinomial2} (y_i \mid \exp(\theta_{g(i)} + m_{g(i)}), \eta).
+$$
+Here we use the alternative parameterization implemented in Stan, meaning that
+$$
+\mathbb E(y_i) = \exp (\theta_{g(i)} + m_{g(i)}), \\
+\text{Var}(y_i) = \mathbb E(y_i) + \frac{(\mathbb E(y_i))^2}{\eta}.
+$$
+The arguments for the likelihood function are:
+
+* `y`: the observed counts.
+* `y_index`: an array whose $i^\text{th}$ element indicates which
+group the $i^\text{th}$ observation belongs to.
+* `eta`: the overdispersion parameter.
+* `m`: a vector of offsets or prior means for $\theta$.
+
+
+\index{{\tt \bfseries laplace\_marginal\_neg\_binomial\_2\_log }!sampling statement|hyperpage}
+
+`y ~ ` **`laplace_marginal_neg_binomial_2_log`**`(y_index, eta, m, covariance_function, (...))`
\newline + +Increment target log probability density with `laplace_marginal_neg_binomial_2_log_lupmf(y | y_index, eta, m, covariance_function, (...))`. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_marginal\_tol\_neg\_binomial\_2\_log }!sampling statement|hyperpage} + +`y ~ ` **`laplace_marginal_tol_neg_binomial_2_log`**`(y_index, eta, m, covariance_function, (...), tol, max_steps, hessian_block_size, solver, max_steps_linesearch)`
\newline + +Increment target log probability density with `laplace_marginal_tol_neg_binomial_2_log_lupmf(y | y_index, eta, m, covariance_function, (...), tol, max_steps, hessian_block_size, solver, max_steps_linesearch)`. +{{< since 2.37 >}} + + +The function signatures for the embedded Laplace approximation with a negative +Binomial likelihood are + + +\index{{\tt \bfseries laplace\_marginal\_neg\_binomial\_2\_log\_lpmf }!{\tt (array[] int y \textbar\ array[] int y\_index, function covariance\_function, tuple(...), vector theta\_init): real}|hyperpage} + +`real` **`laplace_marginal_neg_binomial_2_log_lpmf`**`(array[] int y | array[] int y_index, real eta, vector m, function covariance_function, tuple(...))`
\newline + +Returns an approximation to the log marginal likelihood $p(y \mid \phi, \eta)$ +in the special case where the likelihood $p(y \mid \theta, \eta)$ is a Negative +Binomial distribution with a log link. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_marginal\_tol\_neg\_binomial\_2\_log\_lpmf }!{\tt (array[] int y \textbar\ array[] int y\_index, function covariance\_function, tuple(...), vector theta\_init, real tol, int max\_steps, int hessian\_block\_size, int solver, int max\_steps\_linesearch): real}|hyperpage} + +`real` **`laplace_marginal_tol_neg_binomial_2_log_lpmf`**`(array[] int y | array[] int y_index, real eta, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch)`
\newline + +Returns an approximation to the log marginal likelihood $p(y \mid \phi, \eta)$ +in the special case where the likelihood $p(y \mid \theta, \eta)$ is a Negative +Binomial distribution with a log link, and allows the user to tune the control +parameters of the approximation. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_marginal\_neg\_binomial\_2\_log\_lupmf }!{\tt (array[] int y \textbar\ array[] int y\_index, function covariance\_function, tuple(...), vector theta\_init): real}|hyperpage} + +`real` **`laplace_marginal_neg_binomial_2_log_lupmf`**`(array[] int y | array[] int y_index, real eta, vector m, function covariance_function, tuple(...))`
\newline + +Returns an approximation to the log marginal likelihood $p(y \mid \phi, \eta)$ +in the special case where the likelihood $p(y \mid \theta, \eta)$ is a Negative +Binomial distribution with a log link. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_marginal\_tol\_neg\_binomial\_2\_log\_lupmf }!{\tt (array[] int y \textbar\ array[] int y\_index, function covariance\_function, tuple(...), vector theta\_init, real tol, int max\_steps, int hessian\_block\_size, int solver, int max\_steps\_linesearch): real}|hyperpage} + +`real` **`laplace_marginal_tol_neg_binomial_2_log_lupmf`**`(array[] int y | array[] int y_index, real eta, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch)`
\newline + +Returns an approximation to the log marginal likelihood $p(y \mid \phi, \eta)$ +in the special case where the likelihood $p(y \mid \theta, \eta)$ is a Negative +Binomial distribution with a log link, and allows the user to tune the control +parameters of the approximation. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_latent\_neg\_binomial\_2\_log\_rng }!{\tt (array[] int y, array[] int y\_index, function covariance\_function, tuple(...), vector theta\_init): vector}|hyperpage} + +`vector` **`laplace_latent_neg_binomial_2_log_rng`**`(array[] int y, array[] int y_index, real eta, vector m, function covariance_function, tuple(...))`
\newline + +Returns a draw from the Laplace approximation to the conditional posterior +$p(\theta \mid y, \phi, \eta)$ in the special case where the likelihood +$p(y \mid \theta, \eta)$ is a Negative binomial distribution with a log link. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_latent\_tol\_neg\_binomial\_2\_log\_rng }!{\tt (array[] int y, array[] int y\_index, function covariance\_function, tuple(...), vector theta\_init, real tol, int max\_steps, int hessian\_block\_size, int solver, int max\_steps\_linesearch): vector}|hyperpage} + +`vector` **`laplace_latent_tol_neg_binomial_2_log_rng`**`(array[] int y, array[] int y_index, real eta, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch)`
\newline
+
+Returns a draw from the Laplace approximation to the conditional posterior
+$p(\theta \mid y, \phi, \eta)$ in the special case where the likelihood
+$p(y \mid \theta, \eta)$ is a negative binomial distribution with a log link
+and allows the user to tune the control parameters of the approximation.
+{{< since 2.37 >}}
+
+### Bernoulli with logit link
+
+Given binary outcomes $y_i \in \{0, 1\}$ and a Bernoulli model, the likelihood is
+$$
+p(y \mid \theta, \phi) = \prod_i \text{Bernoulli} (y_i \mid \text{logit}^{-1}(\theta_{g(i)} + m_{g(i)})).
+$$
+The arguments of the likelihood function are:
+
+* `y`: the observed binary outcomes.
+* `y_index`: an array whose $i^\text{th}$ element indicates which
+group the $i^\text{th}$ observation belongs to.
+* `m`: a vector of offsets or prior means for $\theta$.
+
+
+\index{{\tt \bfseries laplace\_marginal\_bernoulli\_logit }!sampling statement|hyperpage}
+
+`y ~ ` **`laplace_marginal_bernoulli_logit`**`(y_index, m, covariance_function, (...))`
\newline + +Increment target log probability density with `laplace_marginal_bernoulli_logit_lupmf(y | y_index, m, covariance_function, (...))`. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_marginal\_tol\_bernoulli\_logit }!sampling statement|hyperpage} + +`y ~ ` **`laplace_marginal_tol_bernoulli_logit`**`(y_index, m, covariance_function, (...), tol, max_steps, hessian_block_size, solver, max_steps_linesearch)`
\newline + +Increment target log probability density with `laplace_marginal_tol_bernoulli_logit_lupmf(y | y_index, m, covariance_function, (...), theta_init, tol, max_steps, hessian_block_size, solver, max_steps_linesearch)`. +{{< since 2.37 >}} + + +The function signatures for the embedded Laplace approximation with a Bernoulli likelihood are + + +\index{{\tt \bfseries laplace\_marginal\_bernoulli\_logit\_lpmf }!{\tt (array[] int y \textbar\ array[] int y\_index, function covariance\_function, tuple(...)): real}|hyperpage} + +`real` **`laplace_marginal_bernoulli_logit_lpmf`**`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...))`
\newline + +Returns an approximation to the log marginal likelihood $p(y \mid \phi)$ +in the special case where the likelihood $p(y \mid \theta)$ is a bernoulli +distribution with a logit link. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_marginal\_tol\_bernoulli\_logit\_lpmf }!{\tt (array[] int y \textbar\ array[] int y\_index, vector m, function covariance\_function, tuple(...), vector theta\_init, real tol, int max\_steps, int hessian\_block\_size, int solver, int max\_steps\_linesearch): real}|hyperpage} + +`real` **`laplace_marginal_tol_bernoulli_logit_lpmf`**`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch)`
\newline + +Returns an approximation to the log marginal likelihood $p(y \mid \phi)$ +in the special case where the likelihood $p(y \mid \theta)$ is a bernoulli +distribution with a logit link and allows the user to tune the control parameters. +{{< since 2.37 >}} + + + +\index{{\tt \bfseries laplace\_marginal\_bernoulli\_logit\_lupmf }!{\tt (array[] int y \textbar\ array[] int y\_index, vector m, function covariance\_function, tuple(...)): real}|hyperpage} + +`real` **`laplace_marginal_bernoulli_logit_lupmf`**`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...))`
\newline + +Returns an approximation to the log marginal likelihood $p(y \mid \phi)$ +in the special case where the likelihood $p(y \mid \theta)$ is a bernoulli +distribution with a logit link. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_marginal\_tol\_bernoulli\_logit\_lupmf }!{\tt (array[] int y \textbar\ array[] int y\_index, vector m, function covariance\_function, tuple(...), vector theta\_init, real tol, int max\_steps, int hessian\_block\_size, int solver, int max\_steps\_linesearch): real}|hyperpage} + +`real` **`laplace_marginal_tol_bernoulli_logit_lupmf`**`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch)`
\newline + +Returns an approximation to the log marginal likelihood $p(y \mid \phi)$ +in the special case where the likelihood $p(y \mid \theta)$ is a bernoulli +distribution with a logit link and allows the user to tune the control parameters. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_latent\_bernoulli\_logit\_rng }!{\tt (array[] int y, array[] int y\_index, function covariance\_function, tuple(...), vector theta\_init): vector}|hyperpage} + +`vector` **`laplace_latent_bernoulli_logit_rng`**`(array[] int y, array[] int y_index, vector m, function covariance_function, tuple(...))`
\newline + +Returns a draw from the Laplace approximation to the conditional posterior +$p(\theta \mid y, \phi)$ in the special case where the likelihood +$p(y \mid \theta)$ is a Bernoulli distribution with a logit link. +{{< since 2.37 >}} + + +\index{{\tt \bfseries laplace\_latent\_tol\_bernoulli\_logit\_rng }!{\tt (array[] int y, array[] int y\_index, vector m, function covariance\_function, tuple(...), vector theta\_init, real tol, int max\_steps, int hessian\_block\_size, int solver, int max\_steps\_linesearch): vector}|hyperpage} + +`vector` **`laplace_latent_tol_bernoulli_logit_rng`**`(array[] int y, array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch)`
\newline + +Returns a draw from the Laplace approximation to the conditional posterior +$p(\theta \mid y, \phi)$ in the special case where the likelihood +$p(y \mid \theta)$ is a Bernoulli distribution with a logit link, +and lets the user tune the control parameters of the approximation. +{{< since 2.37 >}} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/functions-reference/functions_index.qmd b/src/functions-reference/functions_index.qmd index 8bf2b8ad3..6aa9250a3 100644 --- a/src/functions-reference/functions_index.qmd +++ b/src/functions-reference/functions_index.qmd @@ -1621,6 +1621,141 @@ pagetitle: Alphabetical Index -
[`(T x) : R`](real-valued_basic_functions.qmd#index-entry-ab5ca07ba7ba53020939ca3ba7c6e64ccf05cf19) (real-valued_basic_functions.html)
+**laplace_latent_bernoulli_logit_rng**: + + -
[`(array[] int y, array[] int y_index, vector m, function covariance_function, tuple(...)) : vector`](embedded_laplace.qmd#index-entry-627a0deb197f3eba9ab71644f42e8afb3e0793e3) (embedded_laplace.html)
+ + +**laplace_latent_neg_binomial_2_log_rng**: + + -
[`(array[] int y, array[] int y_index, real eta, vector m, function covariance_function, tuple(...)) : vector`](embedded_laplace.qmd#index-entry-05f4fe23889b0c68edde7fc84945af0bf8d1e726) (embedded_laplace.html)
+ + +**laplace_latent_poisson_log_rng**: + + -
[`(array[] int y, array[] int y_index, vector m, function covariance_function, tuple(...)) : vector`](embedded_laplace.qmd#index-entry-e6f1da0a228b9548915532b62d3d8d25ea1f893d) (embedded_laplace.html)
+ + +**laplace_latent_rng**: + + -
[`(function likelihood_function, tuple(...), function covariance_function, tuple(...)) : vector`](embedded_laplace.qmd#index-entry-6d0685309664591fc32d3e2a2304af7aa5459e1c) (embedded_laplace.html)
+ + +**laplace_latent_tol_bernoulli_logit_rng**: + + -
[`(array[] int y, array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch) : vector`](embedded_laplace.qmd#index-entry-66d8665514d9fb3d6d17ee95d68e7d186e87e229) (embedded_laplace.html)
+ + +**laplace_latent_tol_neg_binomial_2_log_rng**: + + -
[`(array[] int y, array[] int y_index, real eta, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch) : vector`](embedded_laplace.qmd#index-entry-95b14b2cc2f02a8abcbb4f4d09fffa1901608512) (embedded_laplace.html)
+ + +**laplace_latent_tol_poisson_log_rng**: + + -
[`(array[] int y, array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch) : vector`](embedded_laplace.qmd#index-entry-97f692748ebfabf0574b7a8e2edaceb940ee4b6b) (embedded_laplace.html)
+ + +**laplace_marginal**: + + -
[`(function likelihood_function, tuple(...) likelihood_arguments, function covariance_function, tuple(...) covariance_arguments) : real`](embedded_laplace.qmd#index-entry-6da15f0ed076016d814cdc278127896f99d29633) (embedded_laplace.html)
+ + +**laplace_marginal_bernoulli_logit**: + + -
[distribution statement](embedded_laplace.qmd#index-entry-1d93ab799518d0aac88e63c01f9655f36c7cbeb6) (embedded_laplace.html)
+ + +**laplace_marginal_bernoulli_logit_lpmf**: + + -
[`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...)) : real`](embedded_laplace.qmd#index-entry-6f7e1622f7b7534d2af01b71426f48c1ed0021b6) (embedded_laplace.html)
+ + +**laplace_marginal_bernoulli_logit_lupmf**: + + -
[`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...)) : real`](embedded_laplace.qmd#index-entry-bb1efa7b7c8782cd786578df678fa7327e6b1e15) (embedded_laplace.html)
+ + +**laplace_marginal_neg_binomial_2_log**: + + -
[distribution statement](embedded_laplace.qmd#index-entry-7f807083cfb7347ece100da5dfe719186a678d02) (embedded_laplace.html)
+ + +**laplace_marginal_neg_binomial_2_log_lpmf**: + + -
[`(array[] int y | array[] int y_index, real eta, vector m, function covariance_function, tuple(...)) : real`](embedded_laplace.qmd#index-entry-b6bc4ba819ac536112abc8a24eb407fdbbf77fa4) (embedded_laplace.html)
+ + +**laplace_marginal_neg_binomial_2_log_lupmf**: + + -
[`(array[] int y | array[] int y_index, real eta, vector m, function covariance_function, tuple(...)) : real`](embedded_laplace.qmd#index-entry-f4be4d5300d7dbedff1ddbd5fdad1b78f0cfb621) (embedded_laplace.html)
+ + +**laplace_marginal_poisson_log**: + + -
[distribution statement](embedded_laplace.qmd#index-entry-b2c6946b65561039eb8bf3999fb59d38a6336e10) (embedded_laplace.html)
+ + +**laplace_marginal_poisson_log_lpmf**: + + -
[`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...)) : real`](embedded_laplace.qmd#index-entry-6ec228e252d52b0e8ee5f3dd836607f8d8cb8a29) (embedded_laplace.html)
+ + +**laplace_marginal_poisson_log_lupmf**: + + -
[`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...)) : real`](embedded_laplace.qmd#index-entry-c092314a5f45deef27ce0e8930a7d28c87ca601d) (embedded_laplace.html)
+ + +**laplace_marginal_tol**: + + -
[`(function likelihood_function, tuple(...), function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch) : real`](embedded_laplace.qmd#index-entry-0f4bd0330deef2db7884dc5a4c933f181e1f2a8c) (embedded_laplace.html)
+ + +**laplace_marginal_tol_bernoulli_logit**: + + -
[distribution statement](embedded_laplace.qmd#index-entry-92d43f7c3643c7d85966bbfc0b2a71546facb37c) (embedded_laplace.html)
+ + +**laplace_marginal_tol_bernoulli_logit_lpmf**: + + -
[`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch) : real`](embedded_laplace.qmd#index-entry-25eee002446b22fc3bbeb35b557d0371f7c6fba5) (embedded_laplace.html)
+ + +**laplace_marginal_tol_bernoulli_logit_lupmf**: + + -
[`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch) : real`](embedded_laplace.qmd#index-entry-d3cec88dd3810edb8b58d7897a3c69e6d4172f8d) (embedded_laplace.html)
+ + +**laplace_marginal_tol_neg_binomial_2_log**: + + -
[distribution statement](embedded_laplace.qmd#index-entry-40016f7d5d96b9d999ca42f446720cf3f3cd5769) (embedded_laplace.html)
+ + +**laplace_marginal_tol_neg_binomial_2_log_lpmf**: + + -
[`(array[] int y | array[] int y_index, real eta, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch) : real`](embedded_laplace.qmd#index-entry-4f9b672d0827401038b66b3b7ebf71225e12794e) (embedded_laplace.html)
+ + +**laplace_marginal_tol_neg_binomial_2_log_lupmf**: + + -
[`(array[] int y | array[] int y_index, real eta, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch) : real`](embedded_laplace.qmd#index-entry-89629abb0f3629f91a5f95662063961ffaa43651) (embedded_laplace.html)
+ + +**laplace_marginal_tol_poisson_log**: + + -
[distribution statement](embedded_laplace.qmd#index-entry-4a7ecca812a308ea648bb31996559cba79738d83) (embedded_laplace.html)
+ + +**laplace_marginal_tol_poisson_log_lpmf**: + + -
[`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch) : real`](embedded_laplace.qmd#index-entry-0bdae1e29545db671474b6dd8ad71c13a9653450) (embedded_laplace.html)
+ + +**laplace_marginal_tol_poisson_log_lupmf**: + + -
[`(array[] int y | array[] int y_index, vector m, function covariance_function, tuple(...), vector theta_init, real tol, int max_steps, int hessian_block_size, int solver, int max_steps_linesearch) : real`](embedded_laplace.qmd#index-entry-218384ef96a2418722ba7d631c085623afb9bc23) (embedded_laplace.html)
+ + **lbeta**: -
[`(real alpha, real beta) : real`](real-valued_basic_functions.qmd#index-entry-def48992e0f8724381904fba78466b0f4a0d14bd) (real-valued_basic_functions.html)
diff --git a/src/reference-manual/_quarto.yml b/src/reference-manual/_quarto.yml
index 889680338..1ef1b59fc 100644
--- a/src/reference-manual/_quarto.yml
+++ b/src/reference-manual/_quarto.yml
@@ -59,6 +59,7 @@ book:
         - pathfinder.qmd
         - variational.qmd
         - laplace.qmd
+        - laplace_embedded.qmd
         - diagnostics.qmd
 
   - part: "Usage"
diff --git a/src/reference-manual/laplace_embedded.qmd b/src/reference-manual/laplace_embedded.qmd
new file mode 100644
index 000000000..3912ccc8a
--- /dev/null
+++ b/src/reference-manual/laplace_embedded.qmd
@@ -0,0 +1,176 @@
+---
+pagetitle: Embedded Laplace Approximation
+---
+
+# Embedded Laplace Approximation
+
+Stan provides functions to perform an embedded Laplace
+approximation for latent Gaussian models, following the procedure described
+by @RasmussenWilliams:2006 and @Rue:2009. This approach is often referred to
+as the integrated or nested Laplace approximation, although the exact details
+of the method can vary. The details of Stan's implementation can be found in
+references [@Margossian:2020; @Margossian:2023].
+
+A standard approach to fit a latent Gaussian model would be to perform inference
+jointly over the latent Gaussian variables and the hyperparameters.
+Instead, the embedded Laplace approximation can be used to do *approximate*
+marginalization of the latent Gaussian variables; we can then
+run any inference algorithm over the remaining hyperparameters, for example
+Hamiltonian Monte Carlo sampling.
+
+Formally, consider a latent Gaussian model,
+\begin{eqnarray*}
+  \phi & \sim & p(\phi) \\
+  \theta & \sim & \text{Multi-Normal}(0, K(\phi)) \\
+  y & \sim & p(y \mid \theta, \phi).
+\end{eqnarray*}
+The motivation for marginalization is to bypass the challenging geometry of the joint
+posterior $p(\phi, \theta \mid y)$. This geometry (e.g. funnels) often frustrates
+inference algorithms, including Hamiltonian Monte Carlo sampling and approximate
+methods such as variational inference. On the other hand, the marginal posterior
+$p(\phi \mid y)$ is often well-behaved and in many cases low-dimensional.
+Furthermore, the conditional posterior $p(\theta \mid \phi, y)$ can be well
+approximated by a normal distribution, if the likelihood $p(y \mid \theta, \phi)$
+is log concave.
+
+
+## Approximation of the conditional posterior and marginal likelihood
+
+The Laplace approximation is the normal distribution that matches the mode
+and curvature of the conditional posterior $p(\theta \mid y, \phi)$.
+The mode,
+$$
+  \theta^* = \underset{\theta}{\text{argmax}} \ p(\theta \mid y, \phi),
+$$
+is estimated by a Newton solver. Since the approximation is normal,
+the curvature is matched by setting the covariance to the inverse of the
+negative Hessian of the log conditional posterior, evaluated at the mode,
+$$
+  \Sigma^* = \left( - \left . \frac{\partial^2}{\partial \theta^2}
+   \log p (\theta \mid \phi, y) \right |_{\theta =\theta^*} \right)^{-1}.
+$$
+The resulting Laplace approximation is then,
+$$
+\hat p_\mathcal{L} (\theta \mid y, \phi) = \text{Multi-Normal}(\theta^*, \Sigma^*)
+\approx p(\theta \mid y, \phi).
+$$
+This approximation implies another approximation for the marginal likelihood,
+$$
+  \hat p_\mathcal{L}(y \mid \phi) := \frac{p(\theta^* \mid \phi) \
+  p(y \mid \theta^*, \phi) }{ \hat p_\mathcal{L} (\theta^* \mid \phi, y) }
+  \approx p(y \mid \phi).
+$$
+Hence, a strategy to approximate the posterior of the latent Gaussian model
+is to first estimate the marginal posterior
+$\hat p_\mathcal{L}(\phi \mid y) \propto p(\phi) \hat p_\mathcal{L} (y \mid \phi)$
+using any algorithm supported by Stan.
+Approximate posterior draws for the latent Gaussian variables are then
+obtained by first drawing $\phi \sim \hat p_\mathcal{L}(\phi \mid y)$ and
+then $\theta \sim \hat p_\mathcal{L}(\theta \mid \phi, y)$.
+
+
+## Trade-offs of the approximation
+
+The embedded Laplace approximation presents several trade-offs with standard
+inference over the joint posterior $p(\theta, \phi \mid y)$. The main
+advantage of the embedded Laplace approximation is that it side-steps the
+intricate geometry of hierarchical models. The marginal posterior
+$p(\phi \mid y)$ can then be handled by Hamiltonian Monte Carlo sampling
+without extensive tuning or reparameterization, and mixing is faster,
+meaning we can run shorter chains to achieve a desired precision. In some cases,
+approximate methods (e.g., variational inference) that
+work poorly on the joint $p(\theta, \phi \mid y)$ work well on the marginal
+posterior $p(\phi \mid y)$.
+
+On the other hand, the embedded Laplace approximation presents certain
+disadvantages. First, we need to perform a Laplace approximation each time
+the log marginal likelihood is evaluated, meaning each iteration
+can be expensive. Secondly, the approximation can introduce non-negligible
+error, especially with non-conventional likelihoods (note the prior
+is always multivariate normal). How these trade-offs are resolved depends on
+the application; see @Margossian:2020 for some examples.
+
+
+## Details of the approximation
+
+### Tuning the Newton solver
+
+A critical component of the embedded Laplace approximation is the Newton solver
+used to estimate the mode $\theta^*$ of $p(\theta \mid \phi, y)$. The objective
+function being maximized is
+$$
+\Psi(\theta) = \log p(\theta \mid \phi) + \log p(y \mid \theta, \phi),
+$$
+and convergence is declared if the change in the objective is sufficiently
+small between two iterations,
+$$
+| \Psi (\theta^{(i + 1)}) - \Psi (\theta^{(i)}) | \le \Delta,
+$$
+for some *tolerance* $\Delta$. The solver also stops after reaching a
+pre-specified *maximum number of steps*: in that case, Stan throws an exception
+and rejects the current proposal. This is not a problem, as
+long as these exceptions are rare and confined to early phases of the warmup.
+
+The Newton iteration can be augmented with a linesearch step to ensure that
+the objective function $\Psi$ does not decrease from one iteration to the next.
+Specifically, suppose that
+$$
+\Psi (\theta^{(i + 1)}) < \Psi (\theta^{(i)}).
+$$
+This can indicate that the Newton step is too large and that we overshot the
+region where the objective function increases. In that case, we can reduce the step
+length by a factor of 2, using
+$$
+  \theta^{(i + 1)} \leftarrow \frac{\theta^{(i + 1)} + \theta^{(i)}}{2}.
+$$
+We repeat this halving of steps until
+$\Psi (\theta^{(i + 1)}) \ge \Psi (\theta^{(i)})$, or until a maximum number
+of linesearch steps is reached. By default, this maximum is set to 0, which
+means the Newton solver performs no linesearch. For certain problems, adding
+a linesearch can make the optimization more stable.
+
+
+The embedded Laplace approximation uses a custom Newton solver, specialized
+to find the mode of $p(\theta \mid \phi, y)$.
+A key step for efficient optimization is to ensure all matrix inversions are
+numerically stable. This can be done using the Woodbury-Sherman-Morrison
+formula and requires one of three matrix decompositions:
+
+1. Cholesky decomposition of the Hessian of the negative log likelihood
+$W = - \partial^2_\theta \log p(y \mid \theta, \phi)$
+
+2. Cholesky decomposition of the prior covariance matrix $K(\phi)$.
+
+3. LU-decomposition of $I + KW$, where $I$ is the identity matrix.
+
+The first solver (1) should be used if the Hessian of the negative log
+likelihood, $W$, is positive-definite. Otherwise the user should rely on (2).
+In rarer cases where
+it is not numerically safe to invert the covariance matrix $K$, users can
+use the third solver as a last-resort option.
+
+
+### Sparse Hessian of the log likelihood
+
+A key step to speed up computation is to take advantage of the sparsity of
+the Hessian of the log likelihood,
+$$
+  H = \frac{\partial^2}{\partial \theta^2} \log p(y \mid \theta, \phi).
+$$
+For example, if the observations $(y_1, \cdots, y_N)$ are conditionally
+independent and each depends on only one component of $\theta$,
+such that
+$$
+  \log p(y \mid \theta, \phi) = \sum_{i = 1}^N \log p(y_i \mid \theta_i, \phi),
+$$
+then the Hessian is diagonal. This leads to faster calculation of the Hessian
+and subsequent sparse matrix operations. This case is common in Gaussian
+process models and certain hierarchical models.
+
+Stan's suite of functions for the embedded Laplace approximation is not
+equipped to handle arbitrary sparsity structures; instead, the functions work on
+block-diagonal Hessians, and the user can specify the size $B$ of these blocks.
+The user is responsible for working out what $B$ is. If the Hessian is dense,
+then we simply set $B = N$.
+
+NOTE: currently, there is no support for sparse prior covariance matrices.
+We expect this to be supported in future versions of Stan.
diff --git a/src/stan-users-guide/gaussian-processes.qmd b/src/stan-users-guide/gaussian-processes.qmd
index e7acf9a84..c92c3ec2e 100644
--- a/src/stan-users-guide/gaussian-processes.qmd
+++ b/src/stan-users-guide/gaussian-processes.qmd
@@ -28,7 +28,7 @@ functions drawn from the process.
 
 Gaussian processes can be encoded in Stan by implementing their mean and
 covariance functions or by using the specialized covariance functions
-outlined below, and plugging the result into the Gaussian model. 
+outlined below, and plugging the result into the Gaussian model.
 This form of model is straightforward and may be used for simulation,
 model fitting, or posterior predictive inference.
 A more efficient Stan implementation for the GP with a normally distributed
 outcome marginalizes
@@ -486,8 +486,165 @@ model {
 }
 ```
 
+#### Poisson GP using an embedded Laplace approximation {-}
+
+For computational reasons, we may want to integrate out the Gaussian process
+$f$, as was done in the normal output model. Unfortunately, exact
+marginalization over $f$ is not possible when the outcome model is not normal.
+Instead, we may perform *approximate* marginalization with an *embedded
+Laplace approximation* [@RasmussenWilliams:2006; @Rue:2009; @Margossian:2020].
+To do so, we first use the function `laplace_marginal` to approximate the marginal
+likelihood $p(y \mid \rho, \alpha, a)$ and sample the
+hyperparameters with Hamiltonian Monte Carlo. Then, we recover the
+integrated-out $f$ in the `generated quantities` block using
+`laplace_latent_rng`.
+
+The embedded Laplace approximation computes a Gaussian approximation of the
+conditional posterior,
+$$
+  \hat p_\mathcal{L}(f \mid \rho, \alpha, a, y) \approx p(f \mid \rho, \alpha, a, y),
+$$
+where $\hat p_\mathcal{L}$ is a Gaussian that matches the mode and curvature
+of $p(f \mid \rho, \alpha, a, y)$.
+We then obtain an approximation of
+the marginal likelihood as follows:
+$$
+  \hat p_\mathcal{L}(y \mid \rho, \alpha, a)
+   = \frac{p(f^* \mid \alpha, \rho) p(y \mid f^*, a)}{
+        \hat p_\mathcal{L}(f \mid \rho, \alpha, a, y)},
+$$
+where $f^*$ is the mode of $p(f \mid \rho, \alpha, a, y)$, obtained via
+numerical optimization.
+
+To use Stan's embedded Laplace approximation, we must define the prior covariance
+function and the log likelihood function in the `functions` block.
+```stan
+functions {
+  // log likelihood function
+  real ll_function(vector f, real a, array[] int y) {
+    return poisson_log_lpmf(y | a + f);
+  }
+
+  // covariance function
+  matrix cov_function(real rho, real alpha, array[] real x, int N, real delta) {
+    matrix[N, N] K = gp_exp_quad_cov(x, alpha, rho);
+    return add_diag(K, delta);
+  }
+}
+```
+
+We then increment `target` in the model block with the approximation to
+$\log p(y \mid \rho, \alpha, a)$.
+```stan
+model {
+  rho ~ inv_gamma(5, 5);
+  alpha ~ std_normal();
+  a ~ std_normal();
+
+  target += laplace_marginal(ll_function, (a, y),
+                             cov_function, (rho, alpha, x, N, delta));
+}
+```
+Notice that we do not need to construct $f$ explicitly, since it is
+marginalized out. Instead, we recover the GP function in `generated quantities`:
+```stan
+generated quantities {
+  vector[N] f = laplace_latent_rng(ll_function, (a, y),
+                                   cov_function, (rho, alpha, x, N, delta));
+}
+```
+
+Users can set the control parameters of the embedded Laplace approximation
+via `laplace_marginal_tol` and `laplace_latent_tol_rng`. When using these
+functions, the user must set *all* the control parameters.
+```stan
+transformed data {
+// ...
+
+  vector[N] f_init = rep_vector(0, N);  // starting point for optimizer.
+  real tol = 1e-6;               // optimizer's tolerance for Laplace approx.
+  int max_num_steps = 1000;      // maximum number of steps for optimizer.
+  int hessian_block_size = 1;    // when Hessian of log likelihood is block
+                                 // diagonal, size of block (here 1).
+  int solver = 1;                // which Newton optimizer to use; default is 1,
+                                 // use 2 and 3 only for special cases.
+  int max_steps_linesearch = 0;  // if >= 1, optimizer does a linesearch with
+                                 // specified number of steps.
+}
 
-#### Logistic Gaussian process regression {-}
+// ...
+
+model {
+// ...
+
+  target += laplace_marginal_tol(ll_function, (a, y),
+                                 cov_function, (rho, alpha, x, N, delta),
+                                 f_init, tol, max_num_steps, hessian_block_size,
+                                 solver, max_steps_linesearch);
+}
+
+generated quantities {
+  vector[N] f = laplace_latent_tol_rng(ll_function, (a, y),
+                                       cov_function, (rho, alpha, x, N, delta),
+                                       f_init, tol, max_num_steps,
+                                       hessian_block_size, solver,
+                                       max_steps_linesearch);
+}
+
+```
+For details about the control parameters, see @Margossian:2023.
+
+
+Stan also provides support for a limited menu of built-in functions, including
+the Poisson distribution with a log link and a prior mean $m$. When using such
+a built-in function, the user does not need to specify a likelihood in the
+`functions` block. However, the user must strictly follow the signature of the
+likelihood: in this case, $m$ must be a vector of length $N$ (to allow for
+different offsets for each observation $y_i$) and we must indicate which
+element of $f$ each component of $y$ matches using the variable $y_\text{index}$.
+In our example, there is a simple pairing $(y_i, f_i)$; however, we could imagine
+a scenario where multiple observations $(y_{j1}, y_{j2}, ...)$ are observed
+for a single $f_j$.
+
+```stan
+transformed data {
+  // ...
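+  // y_index[i] gives the index of the latent component f[j] paired with
+  // observation y[i]; in this example the pairing is simply one-to-one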
+  array[N] int y_index;
+  for (i in 1:N) y_index[i] = i;
+}
+
+// ...
+
+transformed parameters {
+  vector[N] m = rep_vector(a, N);
+}
+
+model {
+  // ...
+  target += laplace_marginal_poisson_log_lpmf(y | y_index, m,
+               cov_function, (rho, alpha, x, N, delta));
+}
+
+generated quantities {
+  vector[N] f = laplace_latent_poisson_log_rng(y, y_index, m,
+                    cov_function, (rho, alpha, x, N, delta));
+}
+
+```
+
+As before, we could specify the control parameters for the embedded Laplace
+approximation using `laplace_marginal_tol_poisson_log_lpmf` and
+`laplace_latent_tol_poisson_log_rng`.
+
+Marginalization with a Laplace approximation can lead to faster inference;
+however, it also introduces an approximation error. In practice, this error
+is negligible when using a Poisson likelihood and the approximation works well
+for log concave likelihoods [@Kuss:2005; @Vanhatalo:2010; @Cseke:2011;
+@Vehtari:2016].
+Still, users should exercise caution, especially
+when trying unconventional likelihoods.
+
+#### Logistic GP regression {-}
 
 For binary classification problems, the observed outputs
 $z_n \in \{ 0,1 \}$ are binary. These outputs are modeled using a Gaussian
@@ -514,10 +671,47 @@ data {
 // ...
 model {
   // ...
-  y ~ bernoulli_logit(a + f);
+  z ~ bernoulli_logit(a + f);
 }
 ```
 
+#### Logistic GP regression with an embedded Laplace approximation {-}
+
+As with the Poisson GP, we cannot marginalize the GP function exactly;
+however, we can resort to an embedded Laplace approximation.
+
+```stan
+functions {
+  // log likelihood function
+  real ll_function(vector f, real a, array[] int z) {
+    return bernoulli_logit_lpmf(z | a + f);
+  }
+
+  // covariance function
+  matrix cov_function(real rho, real alpha, array[] real x, int N, real delta) {
+    matrix[N, N] K = gp_exp_quad_cov(x, alpha, rho);
+    return add_diag(K, delta);
+  }
+}
+
+// ...
+
+model {
+  target += laplace_marginal(ll_function, (a, z),
+                             cov_function, (rho, alpha, x, N, delta));
+}
+
+generated quantities {
+  vector[N] f = laplace_latent_rng(ll_function, (a, z),
+                                   cov_function, (rho, alpha, x, N, delta));
+}
+```
+
+While marginalization with a Laplace approximation can lead to faster inference,
+it also introduces an approximation error. In practice, this error may not be
+negligible with a Bernoulli likelihood; for more discussion see, e.g.,
+[@Vehtari:2016; @Margossian:2020].
+
 ### Automatic relevance determination {-}
 
@@ -799,7 +993,7 @@ input vector `x`. All that is left is to define the univariate
 normal distribution statement for `y`.
 
 The generated quantities block defines the quantity `y2`.  We generate
-`y2` by randomly generating `N2` values from univariate normals with 
+`y2` by randomly generating `N2` values from univariate normals with
 each mean corresponding to the appropriate element in `f`.