DoubleML · SvenKlaassen · Jul 7, 2025 · Jun 2, 2025 · Jun 2, 2025 · Jun 3, 2025
diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py
@@ -106,6 +106,8 @@ def __init__(
             force_all_x_finite=force_all_x_finite,
             force_all_d_finite=False,
         )
+        # reset index to ensure a simple RangeIndex
+        self.data.reset_index(drop=True, inplace=True)
         if self.n_treat != 1:
             raise ValueError("Only one treatment column is allowed for panel data.")
 
@@ -139,7 +141,7 @@ def _data_summary_str(self):
             f"Id variable: {self.id_col}\n"
         )
 
-        data_summary += f"No. Observations: {self.n_obs}\n"
+        data_summary += f"No. Unique Ids: {self.n_ids}\n" f"No. Observations: {self.n_obs}\n"
         return data_summary
 
     @classmethod
@@ -213,9 +215,9 @@ def id_var_unique(self):
         return self._id_var_unique
 
     @property
-    def n_obs(self):
+    def n_ids(self):
         """
-        The number of observations. For panel data, the number of unique values for id_col.
+        The number of unique values for id_col.
         """
         return len(self._id_var_unique)
 

diff --git a/doubleml/data/tests/test_panel_data.py b/doubleml/data/tests/test_panel_data.py
@@ -56,7 +56,7 @@ def test_id_col_setter():
     dml_data.id_col = "id_new"
     assert np.array_equal(dml_data.id_var, id_comp)
     assert dml_data._id_var_unique == np.unique(id_comp)
-    assert dml_data.n_obs == 1
+    assert dml_data.n_ids == 1
 
     msg = "Invalid id variable id_col. a13 is no data column."
     with pytest.raises(ValueError, match=msg):
@@ -169,7 +169,8 @@ def test_panel_data_properties():
 
     assert np.array_equal(dml_data.id_var, df["id"].values)
     assert np.array_equal(dml_data.id_var_unique, np.unique(df["id"].values))
-    assert dml_data.n_obs == len(np.unique(df["id"].values))
+    assert dml_data.n_obs == df.shape[0]
+    assert dml_data.n_ids == len(np.unique(df["id"].values))
     assert dml_data.g_col == "d"
     assert np.array_equal(dml_data.g_values, np.sort(np.unique(df["d"].values)))
     assert dml_data.n_groups == len(np.unique(df["d"].values))

diff --git a/doubleml/data/utils/panel_data_utils.py b/doubleml/data/utils/panel_data_utils.py
@@ -1,8 +1,58 @@
+import pandas as pd
+
 valid_datetime_units = {"Y", "M", "D", "h", "m", "s", "ms", "us", "ns"}
 
+# Units that can be used with pd.Timedelta (unambiguous)
+timedelta_compatible_units = {"D", "h", "m", "s", "ms", "us", "ns"}
+
+# Units that require period arithmetic (ambiguous: months and years vary in length)
+period_only_units = {"Y", "M"}
+
 
 def _is_valid_datetime_unit(unit):
     if unit not in valid_datetime_units:
         raise ValueError("Invalid datetime unit.")
     else:
         return unit
+
+
+def _is_timedelta_compatible(unit):
+    """Check if a datetime unit can be used with pd.Timedelta."""
+    return unit in timedelta_compatible_units
+
+
+def _subtract_periods_safe(datetime_values, reference_datetime, periods, unit):
+    """
+    Check which datetime values are on or after a reference datetime shifted back by a number
+    of periods, handling both timedelta-compatible and period-only units.
+
+    Parameters
+    ----------
+    datetime_values : pandas.Series or numpy.array
+        Array of datetime values to compare
+    reference_datetime : datetime-like
+        Reference datetime to subtract periods from
+    periods : int
+        Number of periods to subtract
+    unit : str
+        Datetime unit
+
+    Returns
+    -------
+    numpy.array
+        Boolean array indicating which datetime_values are >= (reference_datetime - periods)
+    """
+    if periods == 0:
+        # No anticipation periods, so no datetime arithmetic needed
+        return datetime_values >= reference_datetime
+
+    if _is_timedelta_compatible(unit):
+        # Use Timedelta for unambiguous units
+        period_offset = pd.Timedelta(periods, unit=unit)
+        return datetime_values >= (reference_datetime - period_offset)
+    else:
+        # Use Period arithmetic for ambiguous units like 'M' and 'Y'
+        ref_period = pd.Period(reference_datetime, freq=unit)
+        ref_minus_periods = ref_period - periods
+        datetime_periods = pd.PeriodIndex(datetime_values, freq=unit)
+        return datetime_periods >= ref_minus_periods
diff --git a/doubleml/did/__init__.py b/doubleml/did/__init__.py
@@ -6,12 +6,14 @@
 from .did_aggregation import DoubleMLDIDAggregation
 from .did_binary import DoubleMLDIDBinary
 from .did_cs import DoubleMLDIDCS
+from .did_cs_binary import DoubleMLDIDCSBinary
 from .did_multi import DoubleMLDIDMulti
 
 __all__ = [
     "DoubleMLDIDAggregation",
     "DoubleMLDID",
     "DoubleMLDIDCS",
     "DoubleMLDIDBinary",
+    "DoubleMLDIDCSBinary",
     "DoubleMLDIDMulti",
 ]
diff --git a/doubleml/did/datasets/__init__.py b/doubleml/did/datasets/__init__.py
@@ -3,9 +3,11 @@
 """
 
 from .dgp_did_CS2021 import make_did_CS2021
+from .dgp_did_cs_CS2021 import make_did_cs_CS2021
 from .dgp_did_SZ2020 import make_did_SZ2020
 
 __all__ = [
     "make_did_SZ2020",
     "make_did_CS2021",
+    "make_did_cs_CS2021",
 ]
diff --git a/doubleml/did/datasets/dgp_did_cs_CS2021.py b/doubleml/did/datasets/dgp_did_cs_CS2021.py
@@ -0,0 +1,191 @@
+import numpy as np
+
+from doubleml.did.datasets.dgp_did_CS2021 import make_did_CS2021
+
+# Based on https://doi.org/10.1016/j.jeconom.2020.12.001 (see Appendix SC)
+# and https://d2cml-ai.github.io/csdid/examples/csdid_basic.html#Examples-with-simulated-data
+# Cross-sectional version of the data generating process (DGP) for Callaway and Sant'Anna (2021)
+
+
+def make_did_cs_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, lambda_t=0.5, time_type="datetime", **kwargs):
+    """
+    Generate synthetic repeated cross-sectional data for difference-in-differences analysis based on
+    Callaway and Sant'Anna (2021).
+
+    This function creates repeated cross-sectional data with heterogeneous treatment effects across time periods and groups.
+    The data includes pre-treatment periods, multiple treatment groups that receive treatment at different times,
+    and optionally a never-treated group that serves as a control. The true average treatment effect on the
+    treated (ATT) has a heterogeneous structure dependent on covariates and exposure time.
+
+    The data generating process offers six variations (``dgp_type`` 1-6) that differ in how the regression features
+    and propensity score features are derived:
+
+    - DGP 1: Outcome and propensity score are linear (in Z)
+    - DGP 2: Outcome is linear, propensity score is nonlinear
+    - DGP 3: Outcome is nonlinear, propensity score is linear
+    - DGP 4: Outcome and propensity score are nonlinear
+    - DGP 5: Outcome is linear, propensity score is constant (experimental setting)
+    - DGP 6: Outcome is nonlinear, propensity score is constant (experimental setting)
+
+    Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries
+    :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix.
+
+    Further, define :math:`Z_j = (\\tilde{Z}_j - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`,
+    where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`,
+    :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`.
+
+    For a feature vector :math:`W=(W_1, W_2, W_3, W_4)^T` (either X or Z based on ``dgp_type``), the core functions are:
+
+    1. Time-varying outcome regression function for each time period :math:`t`:
+
+       .. math::
+
+           f_{reg,t}(W) = 210 + \\frac{t}{T} \\cdot (27.4 \\cdot W_1 + 13.7 \\cdot W_2 + 13.7 \\cdot W_3 + 13.7 \\cdot W_4)
+
+    2. Group-specific propensity function for each treatment group :math:`g`:
+
+       .. math::
+
+           f_{ps,g}(W) = \\xi \\cdot \\left(1-\\frac{g}{G}\\right) \\cdot
+           (-W_1 + 0.5 \\cdot W_2 - 0.25 \\cdot W_3 - 0.2\\cdot W_4)
+
+    where :math:`T` is the number of time periods, :math:`G` is the number of treatment groups, and :math:`\\xi` is a
+    scale parameter (default: 0.9).
+
+    The panel data model is defined with the following components:
+
+    1. Time effects: :math:`\\delta_t = t` for time period :math:`t`
+
+    2. Individual effects: :math:`\\eta_i \\sim \\mathcal{N}(g_i, 1)` where :math:`g_i` is unit :math:`i`'s treatment group
+
+    3. Treatment effects: For a unit in treatment group :math:`g`, the effect in period :math:`t` is:
+
+       .. math::
+
+           \\theta_{i,t,g} = \\max(t - t_g + 1, 0) + 0.1 \\cdot X_{i,1} \\cdot \\max(t - t_g + 1, 0)
+
+       where :math:`t_g` is the first treatment period for group :math:`g`, :math:`X_{i,1}` is the first covariate for unit
+       :math:`i`, and :math:`\\max(t - t_g + 1, 0)` represents the exposure time (0 for pre-treatment periods).
+
+    4. Potential outcomes for unit :math:`i` in period :math:`t`:
+
+       .. math::
+
+           Y_{i,t}(0) &= f_{reg,t}(W_{reg}) + \\delta_t + \\eta_i + \\varepsilon_{i,0,t}
+
+           Y_{i,t}(1) &= Y_{i,t}(0) + \\theta_{i,t,g} + (\\varepsilon_{i,1,t} - \\varepsilon_{i,0,t})
+
+       where :math:`\\varepsilon_{i,0,t}, \\varepsilon_{i,1,t} \\sim \\mathcal{N}(0, 1)`.
+
+    5. Observed outcomes:
+
+       .. math::
+
+           Y_{i,t} = Y_{i,t}(1) \\cdot 1\\{t \\geq t_g\\} + Y_{i,t}(0) \\cdot 1\\{t < t_g\\}
+
+    6. Treatment assignment:
+
+       For non-experimental settings (DGP 1-4), the probability of being in treatment group :math:`g` is:
+
+       .. math::
+
+           P(G_i = g) = \\frac{\\exp(f_{ps,g}(W_{ps}))}{\\sum_{g'} \\exp(f_{ps,g'}(W_{ps}))}
+
+       For experimental settings (DGP 5-6), each treatment group (including never-treated) has equal probability:
+
+       .. math::
+
+           P(G_i = g) = \\frac{1}{G} \\text{ for all } g
+
+    7. Steps 1-6 generate panel data. To obtain repeated cross-sectional data, the number of generated individuals is increased
+    to `n_obs/lambda_t`, where `lambda_t` denotes the (time-constant) probability of observing a unit at each time period.
+    Each row of the generated panel is then kept independently with probability `lambda_t`.
+
+
+    The variables :math:`W_{reg}` and :math:`W_{ps}` are selected based on the DGP type:
+
+    .. math::
+
+        DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z
+
+        DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X
+
+        DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z
+
+        DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X
+
+        DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0
+
+        DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0
+
+    where settings 5-6 correspond to experimental designs with equal probability across treatment groups.
+
+
+    Parameters
+    ----------
+    n_obs : int, default=1000
+        The number of observations to simulate.
+
+    dgp_type : int, default=1
+        The data generating process to be used (1-6).
+
+    include_never_treated : bool, default=True
+        Whether to include units that are never treated.
+
+    lambda_t : float, default=0.5
+        Probability of observing a unit at each time period. Note that internally `n_obs/lambda_t` individuals are
+        generated of which only a fraction `lambda_t` is observed at each time period (see Step 7 in the DGP description).
+
+    time_type : str, default="datetime"
+        Type of time variable. Either "datetime" or "float".
+
+    **kwargs
+        Additional keyword arguments. Accepts the following parameters:
+
+        `c` (float, default=0.0):
+            Parameter for correlation structure in X.
+
+        `dim_x` (int, default=4):
+            Dimension of feature vectors.
+
+        `xi` (float, default=0.9):
+            Scale parameter for the propensity score function.
+
+        `n_periods` (int, default=5):
+            Number of time periods.
+
+        `anticipation_periods` (int, default=0):
+            Number of periods before treatment where anticipation effects occur.
+
+        `n_pre_treat_periods` (int, default=2):
+            Number of pre-treatment periods.
+
+        `start_date` (str, default="2025-01"):
+            Start date for datetime time variables.
+
+    Returns
+    -------
+    pandas.DataFrame
+        DataFrame containing the simulated repeated cross-sectional data.
+
+    References
+    ----------
+    Callaway, B. and Sant’Anna, P. H. (2021),
+    Difference-in-Differences with multiple time periods. Journal of Econometrics, 225(2), 200-230.
+    doi:`10.1016/j.jeconom.2020.12.001 <https://doi.org/10.1016/j.jeconom.2020.12.001>`_.
+    """
+
+    n_obs_panel = int(np.ceil(n_obs / lambda_t))
+    df_panel = make_did_CS2021(
+        n_obs=n_obs_panel,
+        dgp_type=dgp_type,
+        include_never_treated=include_never_treated,
+        time_type=time_type,
+        **kwargs,
+    )
+
+    # for each time period, randomly select units to observe
+    observed_units = np.random.binomial(1, lambda_t, size=(len(df_panel.index)))
+    df_repeated_cs = df_panel[observed_units == 1].copy()
+
+    return df_repeated_cs
diff --git a/doubleml/did/did.py b/doubleml/did/did.py
@@ -37,7 +37,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML):
         Default is ``5``.
 
     n_rep : int
-        Number of repetitons for the sample splitting.
+        Number of repetitions for the sample splitting.
         Default is ``1``.
 
     score : str
@@ -47,7 +47,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML):
         Default is ``'observational'``.
 
     in_sample_normalization : bool
-        Indicates whether to use a sligthly different normalization from Sant'Anna and Zhao (2020).
+        Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020).
         Default is ``True``.
 
     trimming_rule : str