From 4c320ee54cdf10e90612740aa8231764d64bc837 Mon Sep 17 00:00:00 2001 From: James Bristow Date: Wed, 7 Feb 2024 13:05:29 +1300 Subject: [PATCH] Adding Pyro --- pyro/README.md | 3 + pyro/hmm.py | 774 +++++++++++++++++++++++++++++++++++ pyro/kalman-filter.ipynb | 218 ++++++++++ pyro/mixed_hmm/README.md | 44 ++ pyro/mixed_hmm/__init__.py | 0 pyro/mixed_hmm/experiment.py | 180 ++++++++ pyro/mixed_hmm/model.py | 272 ++++++++++++ pyro/mixed_hmm/seal_data.py | 75 ++++ requirements.txt | 8 +- 9 files changed, 1570 insertions(+), 4 deletions(-) create mode 100644 pyro/README.md create mode 100644 pyro/hmm.py create mode 100644 pyro/kalman-filter.ipynb create mode 100644 pyro/mixed_hmm/README.md create mode 100644 pyro/mixed_hmm/__init__.py create mode 100644 pyro/mixed_hmm/experiment.py create mode 100644 pyro/mixed_hmm/model.py create mode 100644 pyro/mixed_hmm/seal_data.py diff --git a/pyro/README.md b/pyro/README.md new file mode 100644 index 0000000..e62a7f0 --- /dev/null +++ b/pyro/README.md @@ -0,0 +1,3 @@ +# Pyro + +See https://pyro.ai/examples/index.html \ No newline at end of file diff --git a/pyro/hmm.py b/pyro/hmm.py new file mode 100644 index 0000000..25dccfd --- /dev/null +++ b/pyro/hmm.py @@ -0,0 +1,774 @@ +# Copyright (c) 2017-2019 Uber Technologies, Inc. +# SPDX-License-Identifier: Apache-2.0 + +""" +This example shows how to marginalize out discrete model variables in Pyro. + +This combines Stochastic Variational Inference (SVI) with a +variable elimination algorithm, where we use enumeration to exactly +marginalize out some variables from the ELBO computation. We might +call the resulting algorithm collapsed SVI or collapsed SGVB (i.e +collapsed Stochastic Gradient Variational Bayes). In the case where +we exactly sum out all the latent variables (as is the case here), +this algorithm reduces to a form of gradient-based Maximum +Likelihood Estimation. + +To marginalize out discrete variables ``x`` in Pyro's SVI: + +1. Verify that the variable dependency structure in your model + admits tractable inference, i.e. the dependency graph among + enumerated variables should have narrow treewidth. +2. Annotate each target each such sample site in the model + with ``infer={"enumerate": "parallel"}`` +3. Ensure your model can handle broadcasting of the sample values + of those variables +4. Use the ``TraceEnum_ELBO`` loss inside Pyro's ``SVI``. + +Note that empirical results for the models defined here can be found in +reference [1]. This paper also includes a description of the "tensor +variable elimination" algorithm that Pyro uses under the hood to +marginalize out discrete latent variables. + +References + +1. "Tensor Variable Elimination for Plated Factor Graphs", +Fritz Obermeyer, Eli Bingham, Martin Jankowiak, Justin Chiu, +Neeraj Pradhan, Alexander Rush, Noah Goodman. https://arxiv.org/abs/1902.03210 +""" +import argparse +import logging +import sys + +import torch +import torch.nn as nn +from torch.distributions import constraints + +import pyro +import pyro.contrib.examples.polyphonic_data_loader as poly +import pyro.distributions as dist +from pyro import poutine +from pyro.infer import SVI, JitTraceEnum_ELBO, TraceEnum_ELBO, TraceTMC_ELBO +from pyro.infer.autoguide import AutoDelta +from pyro.ops.indexing import Vindex +from pyro.optim import Adam +from pyro.util import ignore_jit_warnings + +logging.basicConfig(format="%(relativeCreated) 9d %(message)s", level=logging.DEBUG) + +# Add another handler for logging debugging events (e.g. 
for profiling) +# in a separate stream that can be captured. +log = logging.getLogger() +debug_handler = logging.StreamHandler(sys.stdout) +debug_handler.setLevel(logging.DEBUG) +debug_handler.addFilter(filter=lambda record: record.levelno <= logging.DEBUG) +log.addHandler(debug_handler) + + +# Let's start with a simple Hidden Markov Model. +# +# x[t-1] --> x[t] --> x[t+1] +# | | | +# V V V +# y[t-1] y[t] y[t+1] +# +# This model includes a plate for the data_dim = 88 keys on the piano. This +# model has two "style" parameters probs_x and probs_y that we'll draw from a +# prior. The latent state is x, and the observed state is y. We'll drive +# probs_* with the guide, enumerate over x, and condition on y. +# +# Importantly, the dependency structure of the enumerated variables has +# narrow treewidth, therefore admitting efficient inference by message passing. +# Pyro's TraceEnum_ELBO will find an efficient message passing scheme if one +# exists. +def model_0(sequences, lengths, args, batch_size=None, include_prior=True): + assert not torch._C._get_tracing_state() + num_sequences, max_length, data_dim = sequences.shape + with poutine.mask(mask=include_prior): + # Our prior on transition probabilities will be: + # stay in the same state with 90% probability; uniformly jump to another + # state with 10% probability. + probs_x = pyro.sample( + "probs_x", + dist.Dirichlet(0.9 * torch.eye(args.hidden_dim) + 0.1).to_event(1), + ) + # We put a weak prior on the conditional probability of a tone sounding. + # We know that on average about 4 of 88 tones are active, so we'll set a + # rough weak prior of 10% of the notes being active at any one time. + probs_y = pyro.sample( + "probs_y", + dist.Beta(0.1, 0.9).expand([args.hidden_dim, data_dim]).to_event(2), + ) + # In this first model we'll sequentially iterate over sequences in a + # minibatch; this will make it easy to reason about tensor shapes. + tones_plate = pyro.plate("tones", data_dim, dim=-1) + for i in pyro.plate("sequences", len(sequences), batch_size): + length = lengths[i] + sequence = sequences[i, :length] + x = 0 + for t in pyro.markov(range(length)): + # On the next line, we'll overwrite the value of x with an updated + # value. If we wanted to record all x values, we could instead + # write x[t] = pyro.sample(...x[t-1]...). + x = pyro.sample( + "x_{}_{}".format(i, t), + dist.Categorical(probs_x[x]), + infer={"enumerate": "parallel"}, + ) + with tones_plate: + pyro.sample( + "y_{}_{}".format(i, t), + dist.Bernoulli(probs_y[x.squeeze(-1)]), + obs=sequence[t], + ) + + +# To see how enumeration changes the shapes of these sample sites, we can use +# the Trace.format_shapes() to print shapes at each site: +# $ python examples/hmm.py -m 0 -n 1 -b 1 -t 5 --print-shapes +# ... +# Sample Sites: +# probs_x dist | 16 16 +# value | 16 16 +# probs_y dist | 16 88 +# value | 16 88 +# tones dist | +# value 88 | +# sequences dist | +# value 1 | +# x_178_0 dist | +# value 16 1 | +# y_178_0 dist 16 88 | +# value 88 | +# x_178_1 dist 16 1 | +# value 16 1 1 | +# y_178_1 dist 16 1 88 | +# value 88 | +# x_178_2 dist 16 1 1 | +# value 16 1 | +# y_178_2 dist 16 88 | +# value 88 | +# x_178_3 dist 16 1 | +# value 16 1 1 | +# y_178_3 dist 16 1 88 | +# value 88 | +# x_178_4 dist 16 1 1 | +# value 16 1 | +# y_178_4 dist 16 88 | +# value 88 | +# +# Notice that enumeration (over 16 states) alternates between two dimensions: +# -2 and -3. If we had not used pyro.markov above, each enumerated variable +# would need its own enumeration dimension. 
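+
+# Before moving on, here is a self-contained illustration of the four-step
+# recipe from the module docstring. It is a minimal, hypothetical sketch (a
+# two-state mixture that is not one of the models in this file); the names
+# toy_model, toy_guide, probs_z and locs are made up for illustration only.
+#
+#     import torch
+#     from torch.distributions import constraints
+#
+#     import pyro
+#     import pyro.distributions as dist
+#     from pyro.infer import SVI, TraceEnum_ELBO
+#     from pyro.optim import Adam
+#
+#     def toy_model(data):
+#         probs_z = pyro.param("probs_z", torch.tensor([0.5, 0.5]),
+#                              constraint=constraints.simplex)
+#         locs = pyro.param("locs", torch.tensor([-1.0, 1.0]))
+#         with pyro.plate("data", len(data)):
+#             # step 2: annotate the discrete site for parallel enumeration
+#             z = pyro.sample("z", dist.Categorical(probs_z),
+#                             infer={"enumerate": "parallel"})
+#             # step 3: locs[z] broadcasts against the enumerated values of z
+#             pyro.sample("obs", dist.Normal(locs[z], 1.0), obs=data)
+#
+#     def toy_guide(data):
+#         pass  # every discrete latent is enumerated away, nothing left to sample
+#
+#     # step 4: use TraceEnum_ELBO, declaring the single data plate
+#     svi = SVI(toy_model, toy_guide, Adam({"lr": 0.1}),
+#               TraceEnum_ELBO(max_plate_nesting=1))
+#     svi.step(torch.randn(100))
+#
+# Step 1 (checking that the dependency structure among enumerated variables is
+# tractable) is trivial here because there is only one discrete site.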
+ + +# Next let's make our simple model faster in two ways: first we'll support +# vectorized minibatches of data, and second we'll support the PyTorch jit +# compiler. To add batch support, we'll introduce a second plate "sequences" +# and randomly subsample data to size batch_size. To add jit support we +# silence some warnings and try to avoid dynamic program structure. + + +# Note that this is the "HMM" model in reference [1] (with the difference that +# in [1] the probabilities probs_x and probs_y are not MAP-regularized with +# Dirichlet and Beta distributions for any of the models) +def model_1(sequences, lengths, args, batch_size=None, include_prior=True): + # Sometimes it is safe to ignore jit warnings. Here we use the + # pyro.util.ignore_jit_warnings context manager to silence warnings about + # conversion to integer, since we know all three numbers will be the same + # across all invocations to the model. + with ignore_jit_warnings(): + num_sequences, max_length, data_dim = map(int, sequences.shape) + assert lengths.shape == (num_sequences,) + assert lengths.max() <= max_length + with poutine.mask(mask=include_prior): + probs_x = pyro.sample( + "probs_x", + dist.Dirichlet(0.9 * torch.eye(args.hidden_dim) + 0.1).to_event(1), + ) + probs_y = pyro.sample( + "probs_y", + dist.Beta(0.1, 0.9).expand([args.hidden_dim, data_dim]).to_event(2), + ) + tones_plate = pyro.plate("tones", data_dim, dim=-1) + # We subsample batch_size items out of num_sequences items. Note that since + # we're using dim=-1 for the notes plate, we need to batch over a different + # dimension, here dim=-2. + with pyro.plate("sequences", num_sequences, batch_size, dim=-2) as batch: + lengths = lengths[batch] + x = 0 + # If we are not using the jit, then we can vary the program structure + # each call by running for a dynamically determined number of time + # steps, lengths.max(). However if we are using the jit, then we try to + # keep a single program structure for all minibatches; the fixed + # structure ends up being faster since each program structure would + # need to trigger a new jit compile stage. + for t in pyro.markov(range(max_length if args.jit else lengths.max())): + with poutine.mask(mask=(t < lengths).unsqueeze(-1)): + x = pyro.sample( + "x_{}".format(t), + dist.Categorical(probs_x[x]), + infer={"enumerate": "parallel"}, + ) + with tones_plate: + pyro.sample( + "y_{}".format(t), + dist.Bernoulli(probs_y[x.squeeze(-1)]), + obs=sequences[batch, t], + ) + + +# Let's see how batching changes the shapes of sample sites: +# $ python examples/hmm.py -m 1 -n 1 -t 5 --batch-size=10 --print-shapes +# ... +# Sample Sites: +# probs_x dist | 16 16 +# value | 16 16 +# probs_y dist | 16 88 +# value | 16 88 +# tones dist | +# value 88 | +# sequences dist | +# value 10 | +# x_0 dist 10 1 | +# value 16 1 1 | +# y_0 dist 16 10 88 | +# value 10 88 | +# x_1 dist 16 10 1 | +# value 16 1 1 1 | +# y_1 dist 16 1 10 88 | +# value 10 88 | +# x_2 dist 16 1 10 1 | +# value 16 1 1 | +# y_2 dist 16 10 88 | +# value 10 88 | +# x_3 dist 16 10 1 | +# value 16 1 1 1 | +# y_3 dist 16 1 10 88 | +# value 10 88 | +# x_4 dist 16 1 10 1 | +# value 16 1 1 | +# y_4 dist 16 10 88 | +# value 10 88 | +# +# Notice that we're now using dim=-2 as a batch dimension (of size 10), +# and that the enumeration dimensions are now dims -3 and -4. + + +# Next let's add a dependency of y[t] on y[t-1]. +# +# x[t-1] --> x[t] --> x[t+1] +# | | | +# V V V +# y[t-1] --> y[t] --> y[t+1] +# +# Note that this is the "arHMM" model in reference [1]. 
+def model_2(sequences, lengths, args, batch_size=None, include_prior=True): + with ignore_jit_warnings(): + num_sequences, max_length, data_dim = map(int, sequences.shape) + assert lengths.shape == (num_sequences,) + assert lengths.max() <= max_length + with poutine.mask(mask=include_prior): + probs_x = pyro.sample( + "probs_x", + dist.Dirichlet(0.9 * torch.eye(args.hidden_dim) + 0.1).to_event(1), + ) + probs_y = pyro.sample( + "probs_y", + dist.Beta(0.1, 0.9).expand([args.hidden_dim, 2, data_dim]).to_event(3), + ) + tones_plate = pyro.plate("tones", data_dim, dim=-1) + with pyro.plate("sequences", num_sequences, batch_size, dim=-2) as batch: + lengths = lengths[batch] + x, y = 0, 0 + for t in pyro.markov(range(max_length if args.jit else lengths.max())): + with poutine.mask(mask=(t < lengths).unsqueeze(-1)): + x = pyro.sample( + "x_{}".format(t), + dist.Categorical(probs_x[x]), + infer={"enumerate": "parallel"}, + ) + # Note the broadcasting tricks here: to index probs_y on tensors x and y, + # we also need a final tensor for the tones dimension. This is conveniently + # provided by the plate associated with that dimension. + with tones_plate as tones: + y = pyro.sample( + "y_{}".format(t), + dist.Bernoulli(probs_y[x, y, tones]), + obs=sequences[batch, t], + ).long() + + +# Next consider a Factorial HMM with two hidden states. +# +# w[t-1] ----> w[t] ---> w[t+1] +# \ x[t-1] --\-> x[t] --\-> x[t+1] +# \ / \ / \ / +# \/ \/ \/ +# y[t-1] y[t] y[t+1] +# +# Note that since the joint distribution of each y[t] depends on two variables, +# those two variables become dependent. Therefore during enumeration, the +# entire joint space of these variables w[t],x[t] needs to be enumerated. +# For that reason, we set the dimension of each to the square root of the +# target hidden dimension. +# +# Note that this is the "FHMM" model in reference [1]. +def model_3(sequences, lengths, args, batch_size=None, include_prior=True): + with ignore_jit_warnings(): + num_sequences, max_length, data_dim = map(int, sequences.shape) + assert lengths.shape == (num_sequences,) + assert lengths.max() <= max_length + hidden_dim = int(args.hidden_dim**0.5) # split between w and x + with poutine.mask(mask=include_prior): + probs_w = pyro.sample( + "probs_w", dist.Dirichlet(0.9 * torch.eye(hidden_dim) + 0.1).to_event(1) + ) + probs_x = pyro.sample( + "probs_x", dist.Dirichlet(0.9 * torch.eye(hidden_dim) + 0.1).to_event(1) + ) + probs_y = pyro.sample( + "probs_y", + dist.Beta(0.1, 0.9).expand([hidden_dim, hidden_dim, data_dim]).to_event(3), + ) + tones_plate = pyro.plate("tones", data_dim, dim=-1) + with pyro.plate("sequences", num_sequences, batch_size, dim=-2) as batch: + lengths = lengths[batch] + w, x = 0, 0 + for t in pyro.markov(range(max_length if args.jit else lengths.max())): + with poutine.mask(mask=(t < lengths).unsqueeze(-1)): + w = pyro.sample( + "w_{}".format(t), + dist.Categorical(probs_w[w]), + infer={"enumerate": "parallel"}, + ) + x = pyro.sample( + "x_{}".format(t), + dist.Categorical(probs_x[x]), + infer={"enumerate": "parallel"}, + ) + with tones_plate as tones: + pyro.sample( + "y_{}".format(t), + dist.Bernoulli(probs_y[w, x, tones]), + obs=sequences[batch, t], + ) + + +# By adding a dependency of x on w, we generalize to a +# Dynamic Bayesian Network. 
+# +# w[t-1] ----> w[t] ---> w[t+1] +# | \ | \ | \ +# | x[t-1] ----> x[t] ----> x[t+1] +# | / | / | / +# V / V / V / +# y[t-1] y[t] y[t+1] +# +# Note that message passing here has roughly the same cost as with the +# Factorial HMM, but this model has more parameters. +# +# Note that this is the "PFHMM" model in reference [1]. +def model_4(sequences, lengths, args, batch_size=None, include_prior=True): + with ignore_jit_warnings(): + num_sequences, max_length, data_dim = map(int, sequences.shape) + assert lengths.shape == (num_sequences,) + assert lengths.max() <= max_length + hidden_dim = int(args.hidden_dim**0.5) # split between w and x + with poutine.mask(mask=include_prior): + probs_w = pyro.sample( + "probs_w", dist.Dirichlet(0.9 * torch.eye(hidden_dim) + 0.1).to_event(1) + ) + probs_x = pyro.sample( + "probs_x", + dist.Dirichlet(0.9 * torch.eye(hidden_dim) + 0.1) + .expand_by([hidden_dim]) + .to_event(2), + ) + probs_y = pyro.sample( + "probs_y", + dist.Beta(0.1, 0.9).expand([hidden_dim, hidden_dim, data_dim]).to_event(3), + ) + tones_plate = pyro.plate("tones", data_dim, dim=-1) + with pyro.plate("sequences", num_sequences, batch_size, dim=-2) as batch: + lengths = lengths[batch] + # Note the broadcasting tricks here: we declare a hidden torch.arange and + # ensure that w and x are always tensors so we can unsqueeze them below, + # thus ensuring that the x sample sites have correct distribution shape. + w = x = torch.tensor(0, dtype=torch.long) + for t in pyro.markov(range(max_length if args.jit else lengths.max())): + with poutine.mask(mask=(t < lengths).unsqueeze(-1)): + w = pyro.sample( + "w_{}".format(t), + dist.Categorical(probs_w[w]), + infer={"enumerate": "parallel"}, + ) + x = pyro.sample( + "x_{}".format(t), + dist.Categorical(Vindex(probs_x)[w, x]), + infer={"enumerate": "parallel"}, + ) + with tones_plate as tones: + pyro.sample( + "y_{}".format(t), + dist.Bernoulli(probs_y[w, x, tones]), + obs=sequences[batch, t], + ) + + +# Next let's consider a neural HMM model. +# +# x[t-1] --> x[t] --> x[t+1] } standard HMM + +# | | | +# V V V +# y[t-1] --> y[t] --> y[t+1] } neural likelihood +# +# First let's define a neural net to generate y logits. +class TonesGenerator(nn.Module): + def __init__(self, args, data_dim): + self.args = args + self.data_dim = data_dim + super().__init__() + self.x_to_hidden = nn.Linear(args.hidden_dim, args.nn_dim) + self.y_to_hidden = nn.Linear(args.nn_channels * data_dim, args.nn_dim) + self.conv = nn.Conv1d(1, args.nn_channels, 3, padding=1) + self.hidden_to_logits = nn.Linear(args.nn_dim, data_dim) + self.relu = nn.ReLU() + + def forward(self, x, y): + # Hidden units depend on two inputs: a one-hot encoded categorical variable x, and + # a bernoulli variable y. Whereas x will typically be enumerated, y will be observed. + # We apply x_to_hidden independently from y_to_hidden, then broadcast the non-enumerated + # y part up to the enumerated x part in the + operation. + x_onehot = y.new_zeros(x.shape[:-1] + (self.args.hidden_dim,)).scatter_( + -1, x, 1 + ) + y_conv = self.relu(self.conv(y.reshape(-1, 1, self.data_dim))).reshape( + y.shape[:-1] + (-1,) + ) + h = self.relu(self.x_to_hidden(x_onehot) + self.y_to_hidden(y_conv)) + return self.hidden_to_logits(h) + + +# We will create a single global instance later. +tones_generator = None + + +# The neural HMM model now uses tones_generator at each time step. +# +# Note that this is the "nnHMM" model in reference [1]. 
+def model_5(sequences, lengths, args, batch_size=None, include_prior=True): + with ignore_jit_warnings(): + num_sequences, max_length, data_dim = map(int, sequences.shape) + assert lengths.shape == (num_sequences,) + assert lengths.max() <= max_length + + # Initialize a global module instance if needed. + global tones_generator + if tones_generator is None: + tones_generator = TonesGenerator(args, data_dim) + pyro.module("tones_generator", tones_generator) + + with poutine.mask(mask=include_prior): + probs_x = pyro.sample( + "probs_x", + dist.Dirichlet(0.9 * torch.eye(args.hidden_dim) + 0.1).to_event(1), + ) + with pyro.plate("sequences", num_sequences, batch_size, dim=-2) as batch: + lengths = lengths[batch] + x = 0 + y = torch.zeros(data_dim) + for t in pyro.markov(range(max_length if args.jit else lengths.max())): + with poutine.mask(mask=(t < lengths).unsqueeze(-1)): + x = pyro.sample( + "x_{}".format(t), + dist.Categorical(probs_x[x]), + infer={"enumerate": "parallel"}, + ) + # Note that since each tone depends on all tones at a previous time step + # the tones at different time steps now need to live in separate plates. + with pyro.plate("tones_{}".format(t), data_dim, dim=-1): + y = pyro.sample( + "y_{}".format(t), + dist.Bernoulli(logits=tones_generator(x, y)), + obs=sequences[batch, t], + ) + + +# Next let's consider a second-order HMM model +# in which x[t+1] depends on both x[t] and x[t-1]. +# +# _______>______ +# _____>_____/______ \ +# / / \ \ +# x[t-1] --> x[t] --> x[t+1] --> x[t+2] +# | | | | +# V V V V +# y[t-1] y[t] y[t+1] y[t+2] +# +# Note that in this model (in contrast to the previous model) we treat +# the transition and emission probabilities as parameters (so they have no prior). +# +# Note that this is the "2HMM" model in reference [1]. +def model_6(sequences, lengths, args, batch_size=None, include_prior=False): + num_sequences, max_length, data_dim = sequences.shape + assert lengths.shape == (num_sequences,) + assert lengths.max() <= max_length + hidden_dim = args.hidden_dim + + if not args.raftery_parameterization: + # Explicitly parameterize the full tensor of transition probabilities, which + # has hidden_dim cubed entries. + probs_x = pyro.param( + "probs_x", + torch.rand(hidden_dim, hidden_dim, hidden_dim), + constraint=constraints.simplex, + ) + else: + # Use the more parsimonious "Raftery" parameterization of + # the tensor of transition probabilities. See reference: + # Raftery, A. E. A model for high-order markov chains. + # Journal of the Royal Statistical Society. 1985. 
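+        # Concretely, the full transition tensor assembled below is a convex
+        # combination of two first-order matrices,
+        #     probs_x[i, j, k] = lambda * probs_x1[j, k] + (1 - lambda) * probs_x2[i, k]
+        # where i, j, k index the state two steps back, the previous state, and
+        # the next state respectively, so the parameter count grows as
+        # O(hidden_dim^2) rather than O(hidden_dim^3).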
+ probs_x1 = pyro.param( + "probs_x1", + torch.rand(hidden_dim, hidden_dim), + constraint=constraints.simplex, + ) + probs_x2 = pyro.param( + "probs_x2", + torch.rand(hidden_dim, hidden_dim), + constraint=constraints.simplex, + ) + mix_lambda = pyro.param( + "mix_lambda", torch.tensor(0.5), constraint=constraints.unit_interval + ) + # we use broadcasting to combine two tensors of shape (hidden_dim, hidden_dim) and + # (hidden_dim, 1, hidden_dim) to obtain a tensor of shape (hidden_dim, hidden_dim, hidden_dim) + probs_x = mix_lambda * probs_x1 + (1.0 - mix_lambda) * probs_x2.unsqueeze(-2) + + probs_y = pyro.param( + "probs_y", + torch.rand(hidden_dim, data_dim), + constraint=constraints.unit_interval, + ) + tones_plate = pyro.plate("tones", data_dim, dim=-1) + with pyro.plate("sequences", num_sequences, batch_size, dim=-2) as batch: + lengths = lengths[batch] + x_curr, x_prev = torch.tensor(0), torch.tensor(0) + # we need to pass the argument `history=2' to `pyro.markov()` + # since our model is now 2-markov + for t in pyro.markov(range(lengths.max()), history=2): + with poutine.mask(mask=(t < lengths).unsqueeze(-1)): + probs_x_t = Vindex(probs_x)[x_prev, x_curr] + x_prev, x_curr = x_curr, pyro.sample( + "x_{}".format(t), + dist.Categorical(probs_x_t), + infer={"enumerate": "parallel"}, + ) + with tones_plate: + probs_y_t = probs_y[x_curr.squeeze(-1)] + pyro.sample( + "y_{}".format(t), + dist.Bernoulli(probs_y_t), + obs=sequences[batch, t], + ) + + +# Next we demonstrate how to parallelize the neural HMM above using Pyro's +# DiscreteHMM distribution. This model is equivalent to model_5 above, but we +# manually unroll loops and fuse ops, leading to a single sample statement. +# DiscreteHMM can lead to over 10x speedup in models where it is applicable. +def model_7(sequences, lengths, args, batch_size=None, include_prior=True): + with ignore_jit_warnings(): + num_sequences, max_length, data_dim = map(int, sequences.shape) + assert lengths.shape == (num_sequences,) + assert lengths.max() <= max_length + + # Initialize a global module instance if needed. 
+ global tones_generator + if tones_generator is None: + tones_generator = TonesGenerator(args, data_dim) + pyro.module("tones_generator", tones_generator) + + with poutine.mask(mask=include_prior): + probs_x = pyro.sample( + "probs_x", + dist.Dirichlet(0.9 * torch.eye(args.hidden_dim) + 0.1).to_event(1), + ) + with pyro.plate("sequences", num_sequences, batch_size, dim=-1) as batch: + lengths = lengths[batch] + y = sequences[batch] if args.jit else sequences[batch, : lengths.max()] + x = torch.arange(args.hidden_dim) + t = torch.arange(y.size(1)) + init_logits = torch.full((args.hidden_dim,), -float("inf")) + init_logits[0] = 0 + trans_logits = probs_x.log() + with ignore_jit_warnings(): + obs_dist = dist.Bernoulli( + logits=tones_generator(x, y.unsqueeze(-2)) + ).to_event(1) + obs_dist = obs_dist.mask((t < lengths.unsqueeze(-1)).unsqueeze(-1)) + hmm_dist = dist.DiscreteHMM(init_logits, trans_logits, obs_dist) + pyro.sample("y", hmm_dist, obs=y) + + +models = { + name[len("model_") :]: model + for name, model in globals().items() + if name.startswith("model_") +} + + +def main(args): + if args.cuda: + torch.set_default_device("cuda") + + logging.info("Loading data") + data = poly.load_data(poly.JSB_CHORALES) + + logging.info("-" * 40) + model = models[args.model] + logging.info( + "Training {} on {} sequences".format( + model.__name__, len(data["train"]["sequences"]) + ) + ) + sequences = data["train"]["sequences"] + lengths = data["train"]["sequence_lengths"] + + # find all the notes that are present at least once in the training set + present_notes = (sequences == 1).sum(0).sum(0) > 0 + # remove notes that are never played (we remove 37/88 notes) + sequences = sequences[..., present_notes] + + if args.truncate: + lengths = lengths.clamp(max=args.truncate) + sequences = sequences[:, : args.truncate] + num_observations = float(lengths.sum()) + pyro.set_rng_seed(args.seed) + pyro.clear_param_store() + + # We'll train using MAP Baum-Welch, i.e. MAP estimation while marginalizing + # out the hidden state x. This is accomplished via an automatic guide that + # learns point estimates of all of our conditional probability tables, + # named probs_*. + guide = AutoDelta( + poutine.block(model, expose_fn=lambda msg: msg["name"].startswith("probs_")) + ) + + # To help debug our tensor shapes, let's print the shape of each site's + # distribution, value, and log_prob tensor. Note this information is + # automatically printed on most errors inside SVI. + if args.print_shapes: + first_available_dim = -2 if model is model_0 else -3 + guide_trace = poutine.trace(guide).get_trace( + sequences, lengths, args=args, batch_size=args.batch_size + ) + model_trace = poutine.trace( + poutine.replay(poutine.enum(model, first_available_dim), guide_trace) + ).get_trace(sequences, lengths, args=args, batch_size=args.batch_size) + logging.info(model_trace.format_shapes()) + + # Enumeration requires a TraceEnum elbo and declaring the max_plate_nesting. + # All of our models have two plates: "data" and "tones". 
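+    # (model_0 iterates over the "sequences" plate in an ordinary Python loop, so
+    # only the "tones" plate is vectorized there and it needs max_plate_nesting=1
+    # below; the vectorized models use both plates and need 2.)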
+ optim = Adam({"lr": args.learning_rate}) + if args.tmc: + if args.jit: + raise NotImplementedError("jit support not yet added for TraceTMC_ELBO") + elbo = TraceTMC_ELBO(max_plate_nesting=1 if model is model_0 else 2) + tmc_model = poutine.infer_config( + model, + lambda msg: ( + {"num_samples": args.tmc_num_samples, "expand": False} + if msg["infer"].get("enumerate", None) == "parallel" + else {} + ), + ) # noqa: E501 + svi = SVI(tmc_model, guide, optim, elbo) + else: + Elbo = JitTraceEnum_ELBO if args.jit else TraceEnum_ELBO + elbo = Elbo( + max_plate_nesting=1 if model is model_0 else 2, + strict_enumeration_warning=(model is not model_7), + jit_options={"time_compilation": args.time_compilation}, + ) + svi = SVI(model, guide, optim, elbo) + + # We'll train on small minibatches. + logging.info("Step\tLoss") + for step in range(args.num_steps): + loss = svi.step(sequences, lengths, args=args, batch_size=args.batch_size) + logging.info("{: >5d}\t{}".format(step, loss / num_observations)) + + if args.jit and args.time_compilation: + logging.debug( + "time to compile: {} s.".format(elbo._differentiable_loss.compile_time) + ) + + # We evaluate on the entire training dataset, + # excluding the prior term so our results are comparable across models. + train_loss = elbo.loss(model, guide, sequences, lengths, args, include_prior=False) + logging.info("training loss = {}".format(train_loss / num_observations)) + + # Finally we evaluate on the test dataset. + logging.info("-" * 40) + logging.info( + "Evaluating on {} test sequences".format(len(data["test"]["sequences"])) + ) + sequences = data["test"]["sequences"][..., present_notes] + lengths = data["test"]["sequence_lengths"] + if args.truncate: + lengths = lengths.clamp(max=args.truncate) + num_observations = float(lengths.sum()) + + # note that since we removed unseen notes above (to make the problem a bit easier and for + # numerical stability) this test loss may not be directly comparable to numbers + # reported on this dataset elsewhere. + test_loss = elbo.loss( + model, guide, sequences, lengths, args=args, include_prior=False + ) + logging.info("test loss = {}".format(test_loss / num_observations)) + + # We expect models with higher capacity to perform better, + # but eventually overfit to the training set. 
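+    # Here "capacity" is simply the total number of scalar parameters in Pyro's
+    # param store: the guide's point estimates, any pyro.param sites in the model,
+    # and the neural net weights registered via pyro.module.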
+ capacity = sum( + value.reshape(-1).size(0) for value in pyro.get_param_store().values() + ) + logging.info("{} capacity = {} parameters".format(model.__name__, capacity)) + + +if __name__ == "__main__": + assert pyro.__version__.startswith("1.8.6") + parser = argparse.ArgumentParser( + description="MAP Baum-Welch learning Bach Chorales" + ) + parser.add_argument( + "-m", + "--model", + default="1", + type=str, + help="one of: {}".format(", ".join(sorted(models.keys()))), + ) + parser.add_argument("-n", "--num-steps", default=50, type=int) + parser.add_argument("-b", "--batch-size", default=8, type=int) + parser.add_argument("-d", "--hidden-dim", default=16, type=int) + parser.add_argument("-nn", "--nn-dim", default=48, type=int) + parser.add_argument("-nc", "--nn-channels", default=2, type=int) + parser.add_argument("-lr", "--learning-rate", default=0.05, type=float) + parser.add_argument("-t", "--truncate", type=int) + parser.add_argument("-p", "--print-shapes", action="store_true") + parser.add_argument("--seed", default=0, type=int) + parser.add_argument("--cuda", action="store_true") + parser.add_argument("--jit", action="store_true") + parser.add_argument("--time-compilation", action="store_true") + parser.add_argument("-rp", "--raftery-parameterization", action="store_true") + parser.add_argument( + "--tmc", + action="store_true", + help="Use Tensor Monte Carlo instead of exact enumeration " + "to estimate the marginal likelihood. You probably don't want to do this, " + "except to see that TMC makes Monte Carlo gradient estimation feasible " + "even with very large numbers of non-reparametrized variables.", + ) + parser.add_argument("--tmc-num-samples", default=10, type=int) + args = parser.parse_args() + main(args) diff --git a/pyro/kalman-filter.ipynb b/pyro/kalman-filter.ipynb new file mode 100644 index 0000000..8c802ee --- /dev/null +++ b/pyro/kalman-filter.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2e5c7973", + "metadata": {}, + "source": [ + "# https://pyro.ai/examples/ekf.html" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9ebb6829", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1.8.6'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "import math\n", + "\n", + "import torch\n", + "import pyro\n", + "import pyro.distributions as dist\n", + "from pyro.infer.autoguide import AutoDelta\n", + "from pyro.optim import Adam\n", + "from pyro.infer import SVI, Trace_ELBO, config_enumerate\n", + "from pyro.contrib.tracking.extended_kalman_filter import EKFState\n", + "from pyro.contrib.tracking.distributions import EKFDistribution\n", + "from pyro.contrib.tracking.dynamic_models import NcvContinuous\n", + "from pyro.contrib.tracking.measurements import PositionMeasurement\n", + "\n", + "pyro.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "87dcc0f7", + "metadata": {}, + "outputs": [], + "source": [ + "dt = 1e-2\n", + "num_frames = 10\n", + "dim = 4\n", + "\n", + "# Continuous model\n", + "ncv = NcvContinuous(dim, 2.0)\n", + "\n", + "# Truth trajectory\n", + "xs_truth = torch.zeros(num_frames, dim)\n", + "# initial direction\n", + "theta0_truth = 0.0\n", + "# initial state\n", + "with torch.no_grad():\n", + " xs_truth[0, :] = torch.tensor([0.0, 0.0, math.cos(theta0_truth), math.sin(theta0_truth)])\n", + " for frame_num in range(1, num_frames):\n", + " # sample independent process noise\n", + 
" dx = pyro.sample('process_noise_{}'.format(frame_num), ncv.process_noise_dist(dt))\n", + " xs_truth[frame_num, :] = ncv(xs_truth[frame_num-1, :], dt=dt) + dx\n", + " \n", + "# Measurements\n", + "measurements = []\n", + "mean = torch.zeros(2)\n", + "# no correlations\n", + "cov = 1e-5 * torch.eye(2)\n", + "with torch.no_grad():\n", + " # sample independent measurement noise\n", + " dzs = pyro.sample('dzs', dist.MultivariateNormal(mean, cov).expand((num_frames,)))\n", + " # compute measurement means\n", + " zs = xs_truth[:, :2] + dzs" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "046c16cb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jbris/miniconda3/envs/data_assim/lib/python3.10/site-packages/torch/autograd/__init__.py:251: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 11070). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", + " Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loss: -15.20763874053955\n", + "loss: -15.339868545532227\n", + "loss: -15.413694381713867\n", + "loss: -15.473196029663086\n", + "loss: -15.507671356201172\n", + "loss: -15.523503303527832\n", + "loss: -15.5301513671875\n", + "loss: -15.532791137695312\n", + "loss: -15.533793449401855\n", + "loss: -15.534193992614746\n", + "loss: -15.534348487854004\n", + "loss: -15.534411430358887\n", + "loss: -15.534439086914062\n", + "loss: -15.534448623657227\n", + "loss: -15.534452438354492\n", + "loss: -15.534453392028809\n", + "loss: -15.534455299377441\n", + "loss: -15.534454345703125\n", + "loss: -15.534454345703125\n", + "loss: -15.534454345703125\n", + "loss: -15.534455299377441\n", + "loss: -15.534455299377441\n", + "loss: -15.534454345703125\n", + "loss: -15.534453392028809\n", + "loss: -15.534456253051758\n" + ] + } + ], + "source": [ + "def model(data):\n", + " # a HalfNormal can be used here as well\n", + " R = pyro.sample('pv_cov', dist.HalfCauchy(2e-6)) * torch.eye(4)\n", + " Q = pyro.sample('measurement_cov', dist.HalfCauchy(1e-6)) * torch.eye(2)\n", + " # observe the measurements\n", + " pyro.sample('track_{}'.format(i), EKFDistribution(xs_truth[0], R, ncv,\n", + " Q, time_steps=num_frames),\n", + " obs=data)\n", + "\n", + "guide = AutoDelta(model) # MAP estimation\n", + "\n", + "optim = pyro.optim.Adam({'lr': 2e-2})\n", + "svi = SVI(model, guide, optim, loss=Trace_ELBO(retain_graph=True))\n", + "\n", + "pyro.set_rng_seed(0)\n", + "pyro.clear_param_store()\n", + "\n", + "for i in range(250):\n", + " loss = svi.step(zs)\n", + " if not i % 10:\n", + " print('loss: ', loss)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bb429939", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "R = guide()['pv_cov'] * torch.eye(4)\n", + "Q = guide()['measurement_cov'] * torch.eye(2)\n", + "ekf_dist = EKFDistribution(xs_truth[0], R, ncv, Q, 
time_steps=num_frames)\n", + "states= ekf_dist.filter_states(zs)\n", + "states" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88a05fba", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyro/mixed_hmm/README.md b/pyro/mixed_hmm/README.md new file mode 100644 index 0000000..649c825 --- /dev/null +++ b/pyro/mixed_hmm/README.md @@ -0,0 +1,44 @@ +# Hierarchical mixed-effect hidden Markov models + +Note: This is a cleaned-up version of the seal experiments in [Bingham et al 2019] that is a simplified variant of some of the analysis in the [momentuHMM harbour seal example](https://github.com/bmcclintock/momentuHMM/blob/master/vignettes/harbourSealExample.R) [McClintock et al 2018]. + +Recent advances in sensor technology have made it possible to capture the movements of multiple wild animals within a single population at high spatiotemporal resolution over long periods of time [McClintock et al 2013, Towner et al 2016]. Discrete state-space models, where the latent state is thought of as corresponding to a behavior state such as "foraging" or "resting", have become popular computational tools for analyzing these new datasets thanks to their interpretability and tractability. + +This example applies several different hierarchical discrete state-space models to location data recorded from a colony of harbour seals on foraging excursions in the North Sea [McClintock et al 2013]. +The raw data are irregularly sampled time series (roughly 5-15 minutes between samples) of GPS coordinates and diving activity for each individual in the colony (10 male and 7 female) over the course of a single day recorded by lightweight tracking devices physically attached to each animal by researchers. They have been preprocessed using the momentuHMM example code into smoothed, temporally regular series of step sizes, turn angles, and diving activity for each individual. + +The models are special cases of a time-inhomogeneous discrete state space model +whose state transition distribution is specified by a hierarchical generalized linear mixed model (GLMM). +At each timestep `t`, for each individual trajectory `b` in each group `a`, we have + +``` +logit(p(x[t,a,b] = state i | x[t-1,a,b] = state j)) = + (epsilon_G[a] + epsilon_I[a,b] + Z_I[a,b].T @ beta1 + Z_G[a].T @ beta2 + Z_T[t,a,b].T @ beta3)[i,j] +``` + +where `a,b` correspond to plate indices, `epsilon_G` and `epsilon_I` are independent random variables for each group and individual within each group respectively, `Z`s are covariates, and `beta`s are parameter vectors. + +The random variables `epsilon` may be either discrete or continuous. +If continuous, they are normally distributed. +If discrete, they are sampled from a set of three possible values shared across the innermost plate of a particular variable. 
+That is, for each individual trajectory `b` in each group `a`, we sample single random effect values for an entire trajectory: + +``` +iota_G[a] ~ Categorical(pi_G) +epsilon_G[a] = Theta_G[iota_G[a]] +iota_I[a,b] ~ Categorical(pi_I[a]) +epsilon_I[a,b] = Theta_I[a][iota_I[a,b]] +``` + +Here `pi_G`, `Theta_G`, `pi_I`, and `Theta_I` are all learnable real-valued parameter vectors and `epsilon` values are batches of vectors the size of state transition matrices. + +Observations `y[t,a,b]` are represented as sequences of real-valued step lengths and turn angles, modelled by zero-inflated Gamma and von Mises likelihoods respectively. +The seal models also include a third observed variable indicating the amount of diving activity between successive locations, which we model with a zero-inflated Beta distribution following [McClintock et al 2018]. + +We grouped animals by sex and implemented versions of this model with (i) no random effects (as a baseline), and with random effects present at the (ii) group, (iii) individual, or (iv) group+individual levels. Unlike the models in [Towner et al 2016], we do not consider fixed effects on any of the parameters. + +# References +* [Obermeyer et al 2019] Obermeyer, F.\*, Bingham, E.\*, Jankowiak, M.\*, Chiu, J., Pradhan, N., Rush, A., and Goodman, N. Tensor Variable Elimination for Plated Factor Graphs, 2019 +* [McClintock et al 2013] McClintock, B. T., Russell, D. J., Matthiopoulos, J., and King, R. Combining individual animal movement and ancillary biotelemetry data to investigate population-level activity budgets. Ecology, 94(4):838–849, 2013 +* [McClintock et al 2018] McClintock, B. T. and Michelot,T. momentuhmm: R package for generalized hidden markov models of animal movement. Methods in Ecology and Evolution, 9(6): 1518–1530, 2018. doi: 10.1111/2041-210X.12995 +* [Towner et al 2016] Towner, A. V., Leos-Barajas, V., Langrock, R., Schick, R. S., Smale, M. J., Kaschke, T., Jewell, O. J., and Papastamatiou, Y. P. Sex-specific and individual preferences for hunting strategies in white sharks. Functional Ecology, 30(8):1397–1407, 2016. diff --git a/pyro/mixed_hmm/__init__.py b/pyro/mixed_hmm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pyro/mixed_hmm/experiment.py b/pyro/mixed_hmm/experiment.py new file mode 100644 index 0000000..460a512 --- /dev/null +++ b/pyro/mixed_hmm/experiment.py @@ -0,0 +1,180 @@ +# Copyright (c) 2017-2019 Uber Technologies, Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +import argparse +import functools +import json +import os +import uuid + +import torch +from model import guide_generic, model_generic +from seal_data import prepare_seal + +import pyro +import pyro.poutine as poutine +from pyro.infer import TraceEnum_ELBO + + +def aic_num_parameters(model, guide=None): + """ + hacky AIC param count that includes all parameters in the model and guide + """ + + def _size(tensor): + """product of shape""" + s = 1 + for d in tensor.shape: + s = s * d + return s + + with poutine.block(), poutine.trace(param_only=True) as param_capture: + TraceEnum_ELBO(max_plate_nesting=2).differentiable_loss(model, guide) + + return sum(_size(node["value"]) for node in param_capture.trace.nodes.values()) + + +def run_expt(args): + data_dir = args["folder"] + dataset = "seal" # args["dataset"] + seed = args["seed"] + optim = args["optim"] + lr = args["learnrate"] + timesteps = args["timesteps"] + schedule = ( + [] if not args["schedule"] else [int(i) for i in args["schedule"].split(",")] + ) + random_effects = {"group": args["group"], "individual": args["individual"]} + + pyro.set_rng_seed(seed) # reproducible random effect parameter init + + filename = os.path.join(data_dir, "prep_seal_data.csv") + config = prepare_seal(filename, random_effects) + + model = functools.partial(model_generic, config) # for JITing + guide = functools.partial(guide_generic, config) + + # count the number of parameters once + num_parameters = aic_num_parameters(model, guide) + + losses = [] + # SGD + if optim == "sgd": + loss_fn = TraceEnum_ELBO(max_plate_nesting=2).differentiable_loss + with pyro.poutine.trace(param_only=True) as param_capture: + loss_fn(model, guide) + params = [ + site["value"].unconstrained() for site in param_capture.trace.nodes.values() + ] + optimizer = torch.optim.Adam(params, lr=lr) + + if schedule: + scheduler = torch.optim.lr_scheduler.MultiStepLR( + optimizer, milestones=schedule, gamma=0.5 + ) + schedule_step_loss = False + else: + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min") + schedule_step_loss = True + + for t in range(timesteps): + optimizer.zero_grad() + loss = loss_fn(model, guide) + loss.backward() + optimizer.step() + scheduler.step(loss.item() if schedule_step_loss else t) + losses.append(loss.item()) + + print( + "Loss: {}, AIC[{}]: ".format(loss.item(), t), + 2.0 * loss + 2.0 * num_parameters, + ) + + # LBFGS + elif optim == "lbfgs": + loss_fn = TraceEnum_ELBO(max_plate_nesting=2).differentiable_loss + with pyro.poutine.trace(param_only=True) as param_capture: + loss_fn(model, guide) + params = [ + site["value"].unconstrained() for site in param_capture.trace.nodes.values() + ] + optimizer = torch.optim.LBFGS(params, lr=lr) + + if schedule: + scheduler = torch.optim.lr_scheduler.MultiStepLR( + optimizer, milestones=schedule, gamma=0.5 + ) + schedule_step_loss = False + else: + schedule_step_loss = True + scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min") + + for t in range(timesteps): + + def closure(): + optimizer.zero_grad() + loss = loss_fn(model, guide) + loss.backward() + return loss + + loss = optimizer.step(closure) + scheduler.step(loss.item() if schedule_step_loss else t) + losses.append(loss.item()) + print( + "Loss: {}, AIC[{}]: ".format(loss.item(), t), + 2.0 * loss + 2.0 * num_parameters, + ) + + else: + raise ValueError("{} not supported optimizer".format(optim)) + + aic_final = 2.0 * losses[-1] + 2.0 * num_parameters + print("AIC final: 
{}".format(aic_final)) + + results = {} + results["args"] = args + results["sizes"] = config["sizes"] + results["likelihoods"] = losses + results["likelihood_final"] = losses[-1] + results["aic_final"] = aic_final + results["aic_num_parameters"] = num_parameters + + if args["resultsdir"] is not None and os.path.exists(args["resultsdir"]): + re_str = "g" + ( + "n" + if args["group"] is None + else "d" if args["group"] == "discrete" else "c" + ) + re_str += "i" + ( + "n" + if args["individual"] is None + else "d" if args["individual"] == "discrete" else "c" + ) + results_filename = "expt_{}_{}_{}.json".format( + dataset, re_str, str(uuid.uuid4().hex)[0:5] + ) + with open(os.path.join(args["resultsdir"], results_filename), "w") as f: + json.dump(results, f) + + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-g", "--group", default="none", type=str) + parser.add_argument("-i", "--individual", default="none", type=str) + parser.add_argument("-f", "--folder", default="./", type=str) + parser.add_argument("-o", "--optim", default="sgd", type=str) + parser.add_argument("-lr", "--learnrate", default=0.05, type=float) + parser.add_argument("-t", "--timesteps", default=1000, type=int) + parser.add_argument("-r", "--resultsdir", default="./results", type=str) + parser.add_argument("-s", "--seed", default=101, type=int) + parser.add_argument("--schedule", default="", type=str) + args = parser.parse_args() + + if args.group == "none": + args.group = None + if args.individual == "none": + args.individual = None + + run_expt(vars(args)) diff --git a/pyro/mixed_hmm/model.py b/pyro/mixed_hmm/model.py new file mode 100644 index 0000000..1a87c23 --- /dev/null +++ b/pyro/mixed_hmm/model.py @@ -0,0 +1,272 @@ +# Copyright (c) 2017-2019 Uber Technologies, Inc. 
+# SPDX-License-Identifier: Apache-2.0 + +import torch +from torch.distributions import constraints + +import pyro +import pyro.distributions as dist +from pyro import poutine +from pyro.infer import config_enumerate +from pyro.ops.indexing import Vindex + + +def guide_generic(config): + """generic mean-field guide for continuous random effects""" + N_state = config["sizes"]["state"] + + if config["group"]["random"] == "continuous": + loc_g = pyro.param("loc_group", lambda: torch.zeros((N_state**2,))) + scale_g = pyro.param( + "scale_group", + lambda: torch.ones((N_state**2,)), + constraint=constraints.positive, + ) + + # initialize individual-level random effect parameters + N_c = config["sizes"]["group"] + if config["individual"]["random"] == "continuous": + loc_i = pyro.param( + "loc_individual", + lambda: torch.zeros( + ( + N_c, + N_state**2, + ) + ), + ) + scale_i = pyro.param( + "scale_individual", + lambda: torch.ones( + ( + N_c, + N_state**2, + ) + ), + constraint=constraints.positive, + ) + + N_c = config["sizes"]["group"] + with pyro.plate("group", N_c, dim=-1): + if config["group"]["random"] == "continuous": + pyro.sample( + "eps_g", + dist.Normal(loc_g, scale_g).to_event(1), + ) # infer={"num_samples": 10}) + + N_s = config["sizes"]["individual"] + with pyro.plate("individual", N_s, dim=-2), poutine.mask( + mask=config["individual"]["mask"] + ): + # individual-level random effects + if config["individual"]["random"] == "continuous": + pyro.sample( + "eps_i", + dist.Normal(loc_i, scale_i).to_event(1), + ) # infer={"num_samples": 10}) + + +@config_enumerate +def model_generic(config): + """Hierarchical mixed-effects hidden markov model""" + + MISSING = config["MISSING"] + N_v = config["sizes"]["random"] + N_state = config["sizes"]["state"] + + # initialize group-level random effect parameterss + if config["group"]["random"] == "discrete": + probs_e_g = pyro.param( + "probs_e_group", + lambda: torch.randn((N_v,)).abs(), + constraint=constraints.simplex, + ) + theta_g = pyro.param("theta_group", lambda: torch.randn((N_v, N_state**2))) + elif config["group"]["random"] == "continuous": + loc_g = torch.zeros((N_state**2,)) + scale_g = torch.ones((N_state**2,)) + + # initialize individual-level random effect parameters + N_c = config["sizes"]["group"] + if config["individual"]["random"] == "discrete": + probs_e_i = pyro.param( + "probs_e_individual", + lambda: torch.randn( + ( + N_c, + N_v, + ) + ).abs(), + constraint=constraints.simplex, + ) + theta_i = pyro.param( + "theta_individual", lambda: torch.randn((N_c, N_v, N_state**2)) + ) + elif config["individual"]["random"] == "continuous": + loc_i = torch.zeros( + ( + N_c, + N_state**2, + ) + ) + scale_i = torch.ones( + ( + N_c, + N_state**2, + ) + ) + + # initialize likelihood parameters + # observation 1: step size (step ~ Gamma) + step_zi_param = pyro.param("step_zi_param", lambda: torch.ones((N_state, 2))) + step_concentration = pyro.param( + "step_param_concentration", + lambda: torch.randn((N_state,)).abs(), + constraint=constraints.positive, + ) + step_rate = pyro.param( + "step_param_rate", + lambda: torch.randn((N_state,)).abs(), + constraint=constraints.positive, + ) + + # observation 2: step angle (angle ~ VonMises) + angle_concentration = pyro.param( + "angle_param_concentration", + lambda: torch.randn((N_state,)).abs(), + constraint=constraints.positive, + ) + angle_loc = pyro.param("angle_param_loc", lambda: torch.randn((N_state,)).abs()) + + # observation 3: dive activity (omega ~ Beta) + omega_zi_param = 
pyro.param("omega_zi_param", lambda: torch.ones((N_state, 2))) + omega_concentration0 = pyro.param( + "omega_param_concentration0", + lambda: torch.randn((N_state,)).abs(), + constraint=constraints.positive, + ) + omega_concentration1 = pyro.param( + "omega_param_concentration1", + lambda: torch.randn((N_state,)).abs(), + constraint=constraints.positive, + ) + + # initialize gamma to uniform + gamma = torch.zeros((N_state**2,)) + + N_c = config["sizes"]["group"] + with pyro.plate("group", N_c, dim=-1): + # group-level random effects + if config["group"]["random"] == "discrete": + # group-level discrete effect + e_g = pyro.sample("e_g", dist.Categorical(probs_e_g)) + eps_g = Vindex(theta_g)[..., e_g, :] + elif config["group"]["random"] == "continuous": + eps_g = pyro.sample( + "eps_g", + dist.Normal(loc_g, scale_g).to_event(1), + ) # infer={"num_samples": 10}) + else: + eps_g = 0.0 + + # add group-level random effect to gamma + gamma = gamma + eps_g + + N_s = config["sizes"]["individual"] + with pyro.plate("individual", N_s, dim=-2), poutine.mask( + mask=config["individual"]["mask"] + ): + # individual-level random effects + if config["individual"]["random"] == "discrete": + # individual-level discrete effect + e_i = pyro.sample("e_i", dist.Categorical(probs_e_i)) + eps_i = Vindex(theta_i)[..., e_i, :] + # assert eps_i.shape[-3:] == (1, N_c, N_state ** 2) and eps_i.shape[0] == N_v + elif config["individual"]["random"] == "continuous": + eps_i = pyro.sample( + "eps_i", + dist.Normal(loc_i, scale_i).to_event(1), + ) # infer={"num_samples": 10}) + else: + eps_i = 0.0 + + # add individual-level random effect to gamma + gamma = gamma + eps_i + + y = torch.tensor(0).long() + + N_t = config["sizes"]["timesteps"] + for t in pyro.markov(range(N_t)): + with poutine.mask(mask=config["timestep"]["mask"][..., t]): + gamma_t = gamma # per-timestep variable + + # finally, reshape gamma as batch of transition matrices + gamma_t = gamma_t.reshape( + tuple(gamma_t.shape[:-1]) + (N_state, N_state) + ) + + # we've accounted for all effects, now actually compute gamma_y + gamma_y = Vindex(gamma_t)[..., y, :] + y = pyro.sample("y_{}".format(t), dist.Categorical(logits=gamma_y)) + + # observation 1: step size + step_dist = dist.Gamma( + concentration=Vindex(step_concentration)[..., y], + rate=Vindex(step_rate)[..., y], + ) + + # zero-inflation with MaskedMixture + step_zi = Vindex(step_zi_param)[..., y, :] + step_zi_mask = config["observations"]["step"][..., t] == MISSING + pyro.sample( + "step_zi_{}".format(t), + dist.Categorical(logits=step_zi), + obs=step_zi_mask.long(), + ) + step_zi_zero_dist = dist.Delta(v=torch.tensor(MISSING)) + step_zi_dist = dist.MaskedMixture( + step_zi_mask, step_dist, step_zi_zero_dist + ) + + pyro.sample( + "step_{}".format(t), + step_zi_dist, + obs=config["observations"]["step"][..., t], + ) + + # observation 2: step angle + angle_dist = dist.VonMises( + concentration=Vindex(angle_concentration)[..., y], + loc=Vindex(angle_loc)[..., y], + ) + pyro.sample( + "angle_{}".format(t), + angle_dist, + obs=config["observations"]["angle"][..., t], + ) + + # observation 3: dive activity + omega_dist = dist.Beta( + concentration0=Vindex(omega_concentration0)[..., y], + concentration1=Vindex(omega_concentration1)[..., y], + ) + + # zero-inflation with MaskedMixture + omega_zi = Vindex(omega_zi_param)[..., y, :] + omega_zi_mask = config["observations"]["omega"][..., t] == MISSING + pyro.sample( + "omega_zi_{}".format(t), + dist.Categorical(logits=omega_zi), + obs=omega_zi_mask.long(), + ) + + 
omega_zi_zero_dist = dist.Delta(v=torch.tensor(MISSING)) + omega_zi_dist = dist.MaskedMixture( + omega_zi_mask, omega_dist, omega_zi_zero_dist + ) + + pyro.sample( + "omega_{}".format(t), + omega_zi_dist, + obs=config["observations"]["omega"][..., t], + ) diff --git a/pyro/mixed_hmm/seal_data.py b/pyro/mixed_hmm/seal_data.py new file mode 100644 index 0000000..a702375 --- /dev/null +++ b/pyro/mixed_hmm/seal_data.py @@ -0,0 +1,75 @@ +# Copyright (c) 2017-2019 Uber Technologies, Inc. +# SPDX-License-Identifier: Apache-2.0 + +import os +from urllib.request import urlopen + +import pandas as pd +import torch + +MISSING = 1e-6 + + +def download_seal_data(filename): + """download the preprocessed seal data and save it to filename""" + url = "https://d2hg8soec8ck9v.cloudfront.net/datasets/prep_seal_data.csv" + with open(filename, "wb") as f: + f.write(urlopen(url).read()) + + +def prepare_seal(filename, random_effects): + if not os.path.exists(filename): + download_seal_data(filename) + + seal_df = pd.read_csv(filename) + obs_keys = ["step", "angle", "omega"] + # data format for z1, z2: + # single tensor with shape (individual, group, time, coords) + observations = torch.zeros((20, 2, 1800, len(obs_keys))).fill_(float("-inf")) + for g, (group, group_df) in enumerate(seal_df.groupby("sex")): + for i, (ind, ind_df) in enumerate(group_df.groupby("ID")): + for o, obs_key in enumerate(obs_keys): + observations[i, g, 0 : len(ind_df), o] = torch.tensor( + ind_df[obs_key].values + ) + + observations[torch.isnan(observations)] = float("-inf") + + # make masks + # mask_i should mask out individuals, it applies at all timesteps + mask_i = (observations > float("-inf")).any(dim=-1).any(dim=-1) # time nonempty + + # mask_t handles padding for time series of different length + mask_t = (observations > float("-inf")).all(dim=-1) # include non-inf + + # temporary hack to avoid zero-inflation issues + # observations[observations == 0.] 
= MISSING + observations[(observations == 0.0) | (observations == float("-inf"))] = MISSING + assert not torch.isnan(observations).any() + + # observations = observations[..., 5:11, :] # truncate for testing + + config = { + "MISSING": MISSING, + "sizes": { + "state": 3, + "random": 4, + "group": observations.shape[1], + "individual": observations.shape[0], + "timesteps": observations.shape[2], + }, + "group": {"random": random_effects["group"], "fixed": None}, + "individual": { + "random": random_effects["individual"], + "fixed": None, + "mask": mask_i, + }, + "timestep": {"random": None, "fixed": None, "mask": mask_t}, + "observations": { + "step": observations[..., 0], + "angle": observations[..., 1], + "omega": observations[..., 2], + }, + } + + return config diff --git a/requirements.txt b/requirements.txt index 0909c70..2151ded 100644 --- a/requirements.txt +++ b/requirements.txt @@ -68,7 +68,7 @@ ipython-genutils==0.2.0 ipywidgets==8.1.1 iterative-ensemble-smoother==0.2.0 jedi==0.19.1 -Jinja2==3.1.2 +Jinja2==3.1.3 joblib==1.3.2 jsonschema==4.20.0 jsonschema-specifications==2023.11.2 @@ -91,7 +91,7 @@ matplotlib==3.8.2 matplotlib-inline==0.1.6 miniKanren==1.0.3 mistune==3.0.2 -# mpi4py==3.1.4 +mpi4py==3.1.5 mpl-tools==0.2.50 mpmath==1.3.0 msgpack==1.0.7 @@ -128,7 +128,7 @@ openturns==1.21 opt-einsum==3.3.0 ordered-set==4.1.0 packaging==23.2 -pandas==2.1 +pandas==2.1.0 pandocfilters==1.5.0 parso==0.8.3 patlib==0.3.5 @@ -156,7 +156,6 @@ PyQt5-Qt5==5.15.2 PyQt5-sip==12.13.0 pyro-api==0.1.2 pyro-ppl==1.8.6 -pysip==1.0.0 pytensor==2.18.4 python-dateutil==2.8.2 python-multipart==0.0.6 @@ -201,6 +200,7 @@ tinycss2==1.2.1 tomli==2.0.1 toolz==0.12.0 torch==2.1.2 +torchvision==0.16.2 tornado==6.1 tqdm==4.66.1 traitlets==5.14.0