diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py index f548ae6a..4ba659ce 100644 --- a/doubleml/data/panel_data.py +++ b/doubleml/data/panel_data.py @@ -106,6 +106,8 @@ def __init__( force_all_x_finite=force_all_x_finite, force_all_d_finite=False, ) + # reset index to ensure a simple RangeIndex + self.data.reset_index(drop=True, inplace=True) if self.n_treat != 1: raise ValueError("Only one treatment column is allowed for panel data.") @@ -139,7 +141,7 @@ def _data_summary_str(self): f"Id variable: {self.id_col}\n" ) - data_summary += f"No. Observations: {self.n_obs}\n" + data_summary += f"No. Unique Ids: {self.n_ids}\n" f"No. Observations: {self.n_obs}\n" return data_summary @classmethod @@ -213,9 +215,9 @@ def id_var_unique(self): return self._id_var_unique @property - def n_obs(self): + def n_ids(self): """ - The number of observations. For panel data, the number of unique values for id_col. + The number of unique values for id_col. """ return len(self._id_var_unique) diff --git a/doubleml/data/tests/test_panel_data.py b/doubleml/data/tests/test_panel_data.py index 2f2250ba..e1a7c925 100644 --- a/doubleml/data/tests/test_panel_data.py +++ b/doubleml/data/tests/test_panel_data.py @@ -56,7 +56,7 @@ def test_id_col_setter(): dml_data.id_col = "id_new" assert np.array_equal(dml_data.id_var, id_comp) assert dml_data._id_var_unique == np.unique(id_comp) - assert dml_data.n_obs == 1 + assert dml_data.n_ids == 1 msg = "Invalid id variable id_col. a13 is no data column." with pytest.raises(ValueError, match=msg): @@ -169,7 +169,8 @@ def test_panel_data_properties(): assert np.array_equal(dml_data.id_var, df["id"].values) assert np.array_equal(dml_data.id_var_unique, np.unique(df["id"].values)) - assert dml_data.n_obs == len(np.unique(df["id"].values)) + assert dml_data.n_obs == df.shape[0] + assert dml_data.n_ids == len(np.unique(df["id"].values)) assert dml_data.g_col == "d" assert np.array_equal(dml_data.g_values, np.sort(np.unique(df["d"].values))) assert dml_data.n_groups == len(np.unique(df["d"].values)) diff --git a/doubleml/data/utils/panel_data_utils.py b/doubleml/data/utils/panel_data_utils.py index abd365eb..cc94d39f 100644 --- a/doubleml/data/utils/panel_data_utils.py +++ b/doubleml/data/utils/panel_data_utils.py @@ -1,8 +1,58 @@ +import pandas as pd + valid_datetime_units = {"Y", "M", "D", "h", "m", "s", "ms", "us", "ns"} +# Units that can be used with pd.Timedelta (unambiguous) +timedelta_compatible_units = {"D", "h", "m", "s", "ms", "us", "ns"} + +# Units that require period arithmetic (ambiguous) +period_only_units = {"Y", "M"} + def _is_valid_datetime_unit(unit): if unit not in valid_datetime_units: raise ValueError("Invalid datetime unit.") else: return unit + + +def _is_timedelta_compatible(unit): + """Check if a datetime unit can be used with pd.Timedelta.""" + return unit in timedelta_compatible_units + + +def _subtract_periods_safe(datetime_values, reference_datetime, periods, unit): + """ + Safely subtract periods from datetime values, handling both timedelta-compatible + and period-only units. 
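# Illustrative sketch of the two arithmetic paths used by _subtract_periods_safe (not part of
# this diff): day-based units go through pd.Timedelta, while month/year units are compared on
# a Period grid because pd.Timedelta rejects the ambiguous units "M" and "Y".
import pandas as pd

dates = pd.Series(pd.to_datetime(["2025-01-01", "2025-03-01", "2025-06-01"]))
ref = pd.Timestamp("2025-04-01")

mask_days = dates >= ref - pd.Timedelta(2, unit="D")            # timedelta-compatible unit
ref_minus_2m = pd.Period(ref, freq="M") - 2                     # Period('2025-02', 'M')
mask_months = pd.PeriodIndex(dates, freq="M") >= ref_minus_2m   # period-only unit
print(mask_days.tolist())      # [False, False, True]
print(list(mask_months))       # [False, True, True]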
+ + Parameters + ---------- + datetime_values : pandas.Series or numpy.array + Array of datetime values to compare + reference_datetime : datetime-like + Reference datetime to subtract periods from + periods : int + Number of periods to subtract + unit : str + Datetime unit + + Returns + ------- + numpy.array + Boolean array indicating which datetime_values are >= (reference_datetime - periods) + """ + if periods == 0: + # No anticipation periods, so no datetime arithmetic needed + return datetime_values >= reference_datetime + + if _is_timedelta_compatible(unit): + # Use Timedelta for unambiguous units + period_offset = pd.Timedelta(periods, unit=unit) + return datetime_values >= (reference_datetime - period_offset) + else: + # Use Period arithmetic for ambiguous units like 'M' and 'Y' + ref_period = pd.Period(reference_datetime, freq=unit) + ref_minus_periods = ref_period - periods + datetime_periods = pd.PeriodIndex(datetime_values, freq=unit) + return datetime_periods >= ref_minus_periods diff --git a/doubleml/did/__init__.py b/doubleml/did/__init__.py index 354ffaa5..369353ef 100644 --- a/doubleml/did/__init__.py +++ b/doubleml/did/__init__.py @@ -6,6 +6,7 @@ from .did_aggregation import DoubleMLDIDAggregation from .did_binary import DoubleMLDIDBinary from .did_cs import DoubleMLDIDCS +from .did_cs_binary import DoubleMLDIDCSBinary from .did_multi import DoubleMLDIDMulti __all__ = [ @@ -13,5 +14,6 @@ "DoubleMLDID", "DoubleMLDIDCS", "DoubleMLDIDBinary", + "DoubleMLDIDCSBinary", "DoubleMLDIDMulti", ] diff --git a/doubleml/did/datasets/__init__.py b/doubleml/did/datasets/__init__.py index aaa5fc0a..306e7b10 100644 --- a/doubleml/did/datasets/__init__.py +++ b/doubleml/did/datasets/__init__.py @@ -3,9 +3,11 @@ """ from .dgp_did_CS2021 import make_did_CS2021 +from .dgp_did_cs_CS2021 import make_did_cs_CS2021 from .dgp_did_SZ2020 import make_did_SZ2020 __all__ = [ "make_did_SZ2020", "make_did_CS2021", + "make_did_cs_CS2021", ] diff --git a/doubleml/did/datasets/dgp_did_cs_CS2021.py b/doubleml/did/datasets/dgp_did_cs_CS2021.py new file mode 100644 index 00000000..08021270 --- /dev/null +++ b/doubleml/did/datasets/dgp_did_cs_CS2021.py @@ -0,0 +1,191 @@ +import numpy as np + +from doubleml.did.datasets.dgp_did_CS2021 import make_did_CS2021 + +# Based on https://doi.org/10.1016/j.jeconom.2020.12.001 (see Appendix SC) +# and https://d2cml-ai.github.io/csdid/examples/csdid_basic.html#Examples-with-simulated-data +# Cross-sectional version of the data generating process (DGP) for Callaway and Sant'Anna (2021) + + +def make_did_cs_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, lambda_t=0.5, time_type="datetime", **kwargs): + """ + Generate synthetic repeated cross-sectional data for difference-in-differences analysis based on + Callaway and Sant'Anna (2021). + + This function creates repeated cross-sectional data with heterogeneous treatment effects across time periods and groups. + The data includes pre-treatment periods, multiple treatment groups that receive treatment at different times, + and optionally a never-treated group that serves as a control. The true average treatment effect on the + treated (ATT) has a heterogeneous structure dependent on covariates and exposure time. 
+ + The data generating process offers six variations (``dgp_type`` 1-6) that differ in how the regression features + and propensity score features are derived: + + - DGP 1: Outcome and propensity score are linear (in Z) + - DGP 2: Outcome is linear, propensity score is nonlinear + - DGP 3: Outcome is nonlinear, propensity score is linear + - DGP 4: Outcome and propensity score are nonlinear + - DGP 5: Outcome is linear, propensity score is constant (experimental setting) + - DGP 6: Outcome is nonlinear, propensity score is constant (experimental setting) + + Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. + + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`, + :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`. + + For a feature vector :math:`W=(W_1, W_2, W_3, W_4)^T` (either X or Z based on ``dgp_type``), the core functions are: + + 1. Time-varying outcome regression function for each time period :math:`t`: + + .. math:: + + f_{reg,t}(W) = 210 + \\frac{t}{T} \\cdot (27.4 \\cdot W_1 + 13.7 \\cdot W_2 + 13.7 \\cdot W_3 + 13.7 \\cdot W_4) + + 2. Group-specific propensity function for each treatment group :math:`g`: + + .. math:: + + f_{ps,g}(W) = \\xi \\cdot \\left(1-\\frac{g}{G}\\right) \\cdot + (-W_1 + 0.5 \\cdot W_2 - 0.25 \\cdot W_3 - 0.2\\cdot W_4) + + where :math:`T` is the number of time periods, :math:`G` is the number of treatment groups, and :math:`\\xi` is a + scale parameter (default: 0.9). + + The panel data model is defined with the following components: + + 1. Time effects: :math:`\\delta_t = t` for time period :math:`t` + + 2. Individual effects: :math:`\\eta_i \\sim \\mathcal{N}(g_i, 1)` where :math:`g_i` is unit :math:`i`'s treatment group + + 3. Treatment effects: For a unit in treatment group :math:`g`, the effect in period :math:`t` is: + + .. math:: + + \\theta_{i,t,g} = \\max(t - t_g + 1, 0) + 0.1 \\cdot X_{i,1} \\cdot \\max(t - t_g + 1, 0) + + where :math:`t_g` is the first treatment period for group :math:`g`, :math:`X_{i,1}` is the first covariate for unit + :math:`i`, and :math:`\\max(t - t_g + 1, 0)` represents the exposure time (0 for pre-treatment periods). + + 4. Potential outcomes for unit :math:`i` in period :math:`t`: + + .. math:: + + Y_{i,t}(0) &= f_{reg,t}(W_{reg}) + \\delta_t + \\eta_i + \\varepsilon_{i,0,t} + + Y_{i,t}(1) &= Y_{i,t}(0) + \\theta_{i,t,g} + (\\varepsilon_{i,1,t} - \\varepsilon_{i,0,t}) + + where :math:`\\varepsilon_{i,0,t}, \\varepsilon_{i,1,t} \\sim \\mathcal{N}(0, 1)`. + + 5. Observed outcomes: + + .. math:: + + Y_{i,t} = Y_{i,t}(1) \\cdot 1\\{t \\geq t_g\\} + Y_{i,t}(0) \\cdot 1\\{t < t_g\\} + + 6. Treatment assignment: + + For non-experimental settings (DGP 1-4), the probability of being in treatment group :math:`g` is: + + .. math:: + + P(G_i = g) = \\frac{\\exp(f_{ps,g}(W_{ps}))}{\\sum_{g'} \\exp(f_{ps,g'}(W_{ps}))} + + For experimental settings (DGP 5-6), each treatment group (including never-treated) has equal probability: + + .. math:: + + P(G_i = g) = \\frac{1}{G} \\text{ for all } g + + 7. Steps 1-6 generate panel data. 
To obtain repeated cross-sectional data, the number of generated individuals is increased + to `n_obs/lambda_t`, where `lambda_t` denotes the probability to observe a unit at each time period (time constant). + for each + + + The variables :math:`W_{reg}` and :math:`W_{ps}` are selected based on the DGP type: + + .. math:: + + DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z + + DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X + + DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z + + DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X + + DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0 + + DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0 + + where settings 5-6 correspond to experimental designs with equal probability across treatment groups. + + + Parameters + ---------- + n_obs : int, default=1000 + The number of observations to simulate. + + dgp_type : int, default=1 + The data generating process to be used (1-6). + + include_never_treated : bool, default=True + Whether to include units that are never treated. + + lambda_t : float, default=0.5 + Probability of observing a unit at each time period. Note that internally `n_obs/lambda_t` individuals are + generated of which only a fraction `lambda_t` is observed at each time period (see Step 7 in the DGP description). + + time_type : str, default="datetime" + Type of time variable. Either "datetime" or "float". + + **kwargs + Additional keyword arguments. Accepts the following parameters: + + `c` (float, default=0.0): + Parameter for correlation structure in X. + + `dim_x` (int, default=4): + Dimension of feature vectors. + + `xi` (float, default=0.9): + Scale parameter for the propensity score function. + + `n_periods` (int, default=5): + Number of time periods. + + `anticipation_periods` (int, default=0): + Number of periods before treatment where anticipation effects occur. + + `n_pre_treat_periods` (int, default=2): + Number of pre-treatment periods. + + `start_date` (str, default="2025-01"): + Start date for datetime time variables. + + Returns + ------- + pandas.DataFrame + DataFrame containing the simulated panel data. + + References + ---------- + Callaway, B. and Sant’Anna, P. H. (2021), + Difference-in-Differences with multiple time periods. Journal of Econometrics, 225(2), 200-230. + doi:`10.1016/j.jeconom.2020.12.001 `_. + """ + + n_obs_panel = int(np.ceil(n_obs / lambda_t)) + df_panel = make_did_CS2021( + n_obs=n_obs_panel, + dgp_type=dgp_type, + include_never_treated=include_never_treated, + time_type=time_type, + **kwargs, + ) + + # for each time period, randomly select units to observe + observed_units = np.random.binomial(1, lambda_t, size=(len(df_panel.index))) + df_repeated_cs = df_panel[observed_units == 1].copy() + + return df_repeated_cs diff --git a/doubleml/did/did.py b/doubleml/did/did.py index 7a671993..56bfe79c 100644 --- a/doubleml/did/did.py +++ b/doubleml/did/did.py @@ -37,7 +37,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str @@ -47,7 +47,7 @@ class DoubleMLDID(LinearScoreMixin, DoubleML): Default is ``'observational'``. in_sample_normalization : bool - Indicates whether to use a sligthly different normalization from Sant'Anna and Zhao (2020). + Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020). Default is ``True``. 
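# Usage sketch for the make_did_cs_CS2021 generator added above (illustrative, not part of
# this diff; the exact column set depends on the underlying make_did_CS2021 output):
import numpy as np
from doubleml.did.datasets import make_did_cs_CS2021

np.random.seed(42)
df_cs = make_did_cs_CS2021(n_obs=1000, dgp_type=1, lambda_t=0.5, time_type="datetime")
# ceil(1000 / 0.5) = 2000 panel units are drawn and each row is kept with probability 0.5,
# so the expected number of observed rows per period is roughly n_obs = 1000.
print(df_cs.shape)
print(df_cs.columns.tolist())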
trimming_rule : str diff --git a/doubleml/did/did_binary.py b/doubleml/did/did_binary.py index e4d309db..83e49cd0 100644 --- a/doubleml/did/did_binary.py +++ b/doubleml/did/did_binary.py @@ -70,7 +70,7 @@ class DoubleMLDIDBinary(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str @@ -80,7 +80,7 @@ class DoubleMLDIDBinary(LinearScoreMixin, DoubleML): Default is ``'observational'``. in_sample_normalization : bool - Indicates whether to use a sligthly different normalization from Sant'Anna and Zhao (2020). + Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020). Default is ``True``. trimming_rule : str @@ -124,6 +124,12 @@ def __init__( super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting=False) self._check_data(self._dml_data) + # for did panel data the scores are based on the number of unique ids + self._n_obs = obj_dml_data.n_ids + self._score_dim = (self._n_obs, self.n_rep, self._dml_data.n_treat) + # reinitialze arrays + self._initialize_arrays() + g_values = self._dml_data.g_values t_values = self._dml_data.t_values @@ -157,10 +163,10 @@ def __init__( # Preprocess data # Y1, Y0 might be needed if we want to support custom estimators and scores; currently only output y_diff - self._panel_data_wide = self._preprocess_data(self._g_value, self._t_value_pre, self._t_value_eval) + self._data_subset = self._preprocess_data(self._g_value, self._t_value_pre, self._t_value_eval) # Handling id values to match pairwise evaluation & simultaneous inference - id_panel_data = self._panel_data_wide[self._dml_data.id_col].values + id_panel_data = self._data_subset[self._dml_data.id_col].values id_original = self._dml_data.id_var_unique if not np.all(np.isin(id_panel_data, id_original)): raise ValueError("The id values in the panel data are not a subset of the original id values.") @@ -171,14 +177,13 @@ def __init__( # Numeric values for positions of the entries in id_panel_data inside id_original # np.nonzero(np.isin(id_original, id_panel_data)) - self._n_subset = self._panel_data_wide.shape[0] - self._n_obs = self._n_subset # Effective sample size used for resampling - self._n_treated_subset = self._panel_data_wide["G_indicator"].sum() + self._n_obs_subset = self._data_subset.shape[0] # Effective sample size used for resampling + self._n_treated_subset = self._data_subset["G_indicator"].sum() # Save x and y for later ML estimation - self._x_panel = self._panel_data_wide.loc[:, self._dml_data.x_cols].values - self._y_panel = self._panel_data_wide.loc[:, "y_diff"].values - self._g_panel = self._panel_data_wide.loc[:, "G_indicator"].values + self._x_data_subset = self._data_subset.loc[:, self._dml_data.x_cols].values + self._y_data_subset = self._data_subset.loc[:, "y_diff"].values + self._g_data_subset = self._data_subset.loc[:, "G_indicator"].values valid_scores = ["observational", "experimental"] _check_score(self.score, valid_scores, allow_callable=False) @@ -191,7 +196,8 @@ def __init__( ) # set stratication for resampling - self._strata = self._panel_data_wide["G_indicator"] + self._strata = self._data_subset["G_indicator"] + self._n_obs_sample_splitting = self.n_obs_subset if draw_sample_splitting: self.draw_sample_splitting() @@ -233,58 +239,17 @@ def __init__( self._sensitivity_implemented = True self._external_predictions_implemented = True - def __str__(self): - class_name = self.__class__.__name__ - 
header = f"================== {class_name} Object ==================\n" - data_summary = self._dml_data._data_summary_str() - score_info = ( - f"Score function: {str(self.score)}\n" - f"Treatment group: {str(self.g_value)}\n" - f"Pre-treatment period: {str(self.t_value_pre)}\n" - f"Evaluation period: {str(self.t_value_eval)}\n" - f"Control group: {str(self.control_group)}\n" - f"Anticipation periods: {str(self.anticipation_periods)}\n" - f"Effective sample size: {str(self.n_obs)}\n" - ) - learner_info = "" - for key, value in self.learner.items(): - learner_info += f"Learner {key}: {str(value)}\n" - if self.nuisance_loss is not None: - learner_info += "Out-of-sample Performance:\n" - is_classifier = [value for value in self._is_classifier.values()] - is_regressor = [not value for value in is_classifier] - if any(is_regressor): - learner_info += "Regression:\n" - for learner in [key for key, value in self._is_classifier.items() if value is False]: - learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n" - if any(is_classifier): - learner_info += "Classification:\n" - for learner in [key for key, value in self._is_classifier.items() if value is True]: - learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n" - - if self._is_cluster_data: - resampling_info = ( - f"No. folds per cluster: {self._n_folds_per_cluster}\n" - f"No. folds: {self.n_folds}\n" - f"No. repeated sample splits: {self.n_rep}\n" - ) - else: - resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n" - fit_summary = str(self.summary) - res = ( - header - + "\n------------------ Data summary ------------------\n" - + data_summary - + "\n------------------ Score & algorithm ------------------\n" - + score_info - + "\n------------------ Machine learner ------------------\n" - + learner_info - + "\n------------------ Resampling ------------------\n" - + resampling_info - + "\n------------------ Fit summary ------------------\n" - + fit_summary - ) - return res + def _format_score_info_str(self): + lines = [ + f"Score function: {str(self.score)}", + f"Treatment group: {str(self.g_value)}", + f"Pre-treatment period: {str(self.t_value_pre)}", + f"Evaluation period: {str(self.t_value_eval)}", + f"Control group: {str(self.control_group)}", + f"Anticipation periods: {str(self.anticipation_periods)}", + f"Effective sample size: {str(self.n_obs_subset)}", + ] + return "\\n".join(lines) @property def g_value(self): @@ -336,11 +301,11 @@ def anticipation_periods(self): return self._anticipation_periods @property - def panel_data_wide(self): + def data_subset(self): """ The preprocessed panel data in wide format. """ - return self._panel_data_wide + return self._data_subset @property def id_positions(self): @@ -371,11 +336,11 @@ def trimming_threshold(self): return self._trimming_threshold @property - def n_obs(self): + def n_obs_subset(self): """ The number of observations used for estimation. 
""" - return self._n_subset + return self._n_obs_subset def _initialize_ml_nuisance_params(self): if self.score == "observational": @@ -415,9 +380,10 @@ def _preprocess_data(self, g_value, pre_t, eval_t): id_col = self._dml_data.id_col g_col = self._dml_data.g_col - # relevent data subset - data_subset_indicator = data[t_col].isin([pre_t, eval_t]) - data_subset = data[data_subset_indicator].sort_values(by=[id_col, t_col]) + # relevent data subset: Only include units which are observed in both periods + relevant_time_data = data[data[t_col].isin([pre_t, eval_t])] + ids_with_both_periods_filter = relevant_time_data.groupby(id_col)[t_col].transform("nunique") == 2 + data_subset = relevant_time_data[ids_with_both_periods_filter].sort_values(by=[id_col, t_col]) # Construct G (treatment group) indicating treatment period in g G_indicator = (data_subset[g_col] == g_value).astype(int) @@ -463,8 +429,8 @@ def _preprocess_data(self, g_value, pre_t, eval_t): def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): # Here: d is a binary treatment indicator - x, y = check_X_y(self._x_panel, self._y_panel, force_all_finite=False) - x, d = check_X_y(x, self._g_panel, force_all_finite=False) + x, y = check_X_y(self._x_data_subset, self._y_data_subset, force_all_finite=False) + x, d = check_X_y(x, self._g_data_subset, force_all_finite=False) # nuisance g # get train indices for d == 0 smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d) @@ -542,7 +508,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa psi_a, psi_b = self._score_elements(y, d, g_hat0["preds"], g_hat1["preds"], m_hat["preds"], p_hat) extend_kwargs = { - "n_obs": self._dml_data.n_obs, + "n_obs": self._dml_data.n_ids, "id_positions": self.id_positions, } psi_elements = { @@ -604,8 +570,8 @@ def _score_elements(self, y, d, g_hat0, g_hat1, m_hat, p_hat): def _nuisance_tuning( self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search ): - x, y = check_X_y(self._x_panel, self._y_panel, force_all_finite=False) - x, d = check_X_y(x, self._g_panel, force_all_finite=False) + x, y = check_X_y(self._x_data_subset, self._y_data_subset, force_all_finite=False) + x, d = check_X_y(x, self._g_data_subset, force_all_finite=False) # get train indices for d == 0 and d == 1 smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d) @@ -669,8 +635,8 @@ def _nuisance_tuning( return res def _sensitivity_element_est(self, preds): - y = self._y_panel - d = self._g_panel + y = self._y_data_subset + d = self._g_data_subset m_hat = _get_id_positions(preds["predictions"]["ml_m"], self.id_positions) g_hat0 = _get_id_positions(preds["predictions"]["ml_g0"], self.id_positions) @@ -707,13 +673,13 @@ def _sensitivity_element_est(self, preds): psi_nu2 = nu2_score_element - nu2 extend_kwargs = { - "n_obs": self._dml_data.n_obs, + "n_obs": self._dml_data.n_ids, "id_positions": self.id_positions, "fill_value": 0.0, } # add scaling to make variance estimation consistent (sample size difference) - scaling = self._dml_data.n_obs / self._n_subset + scaling = self._dml_data.n_ids / self._n_obs_subset element_dict = { "sigma2": sigma2, "nu2": nu2, diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py index ab2af5b9..aa97996f 100644 --- a/doubleml/did/did_cs.py +++ b/doubleml/did/did_cs.py @@ -37,7 +37,7 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. 
Default is ``1``. score : str @@ -47,7 +47,7 @@ class DoubleMLDIDCS(LinearScoreMixin, DoubleML): Default is ``'observational'``. in_sample_normalization : bool - Indicates whether to use a sligthly different normalization from Sant'Anna and Zhao (2020). + Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020). Default is ``True``. trimming_rule : str @@ -219,19 +219,17 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa # THIS DIFFERS FROM THE PAPER due to stratified splitting this should be the same for each fold # nuisance estimates of the uncond. treatment prob. - p_hat = np.full_like(d, np.nan, dtype="float64") - for train_index, test_index in smpls: - p_hat[test_index] = np.mean(d[train_index]) + p_hat = np.full_like(d, d.mean(), dtype="float64") # nuisance estimates of the uncond. time prob. - lambda_hat = np.full_like(t, np.nan, dtype="float64") - for train_index, test_index in smpls: - lambda_hat[test_index] = np.mean(t[train_index]) + lambda_hat = np.full_like(t, t.mean(), dtype="float64") # nuisance g smpls_d0_t0, smpls_d0_t1, smpls_d1_t0, smpls_d1_t1 = _get_cond_smpls_2d(smpls, d, t) if external_predictions["ml_g_d0_t0"] is not None: - g_hat_d0_t0 = {"preds": external_predictions["ml_g_d0_t0"], "targets": None, "models": None} + g_hat_d0_t0_targets = np.full_like(y, np.nan, dtype="float64") + g_hat_d0_t0_targets[(d == 0) & (t == 0)] = y[(d == 0) & (t == 0)] + g_hat_d0_t0 = {"preds": external_predictions["ml_g_d0_t0"], "targets": g_hat_d0_t0_targets, "models": None} else: g_hat_d0_t0 = _dml_cv_predict( self._learner["ml_g"], @@ -247,7 +245,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa g_hat_d0_t0["targets"] = g_hat_d0_t0["targets"].astype(float) g_hat_d0_t0["targets"][np.invert((d == 0) & (t == 0))] = np.nan if external_predictions["ml_g_d0_t1"] is not None: - g_hat_d0_t1 = {"preds": external_predictions["ml_g_d0_t1"], "targets": None, "models": None} + g_hat_d0_t1_targets = np.full_like(y, np.nan, dtype="float64") + g_hat_d0_t1_targets[(d == 0) & (t == 1)] = y[(d == 0) & (t == 1)] + g_hat_d0_t1 = {"preds": external_predictions["ml_g_d0_t1"], "targets": g_hat_d0_t1_targets, "models": None} else: g_hat_d0_t1 = _dml_cv_predict( self._learner["ml_g"], @@ -262,7 +262,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa g_hat_d0_t1["targets"] = g_hat_d0_t1["targets"].astype(float) g_hat_d0_t1["targets"][np.invert((d == 0) & (t == 1))] = np.nan if external_predictions["ml_g_d1_t0"] is not None: - g_hat_d1_t0 = {"preds": external_predictions["ml_g_d1_t0"], "targets": None, "models": None} + g_hat_d1_t0_targets = np.full_like(y, np.nan, dtype="float64") + g_hat_d1_t0_targets[(d == 1) & (t == 0)] = y[(d == 1) & (t == 0)] + g_hat_d1_t0 = {"preds": external_predictions["ml_g_d1_t0"], "targets": g_hat_d1_t0_targets, "models": None} else: g_hat_d1_t0 = _dml_cv_predict( self._learner["ml_g"], @@ -277,7 +279,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa g_hat_d1_t0["targets"] = g_hat_d1_t0["targets"].astype(float) g_hat_d1_t0["targets"][np.invert((d == 1) & (t == 0))] = np.nan if external_predictions["ml_g_d1_t1"] is not None: - g_hat_d1_t1 = {"preds": external_predictions["ml_g_d1_t1"], "targets": None, "models": None} + g_hat_d1_t1_targets = np.full_like(y, np.nan, dtype="float64") + g_hat_d1_t1_targets[(d == 1) & (t == 1)] = y[(d == 1) & (t == 1)] + g_hat_d1_t1 = {"preds": external_predictions["ml_g_d1_t1"], 
"targets": g_hat_d1_t1_targets, "models": None} else: g_hat_d1_t1 = _dml_cv_predict( self._learner["ml_g"], @@ -297,7 +301,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa if self.score == "observational": # nuisance m if external_predictions["ml_m"] is not None: - m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None} + m_hat = {"preds": external_predictions["ml_m"], "targets": d, "models": None} else: m_hat = _dml_cv_predict( self._learner["ml_m"], diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py new file mode 100644 index 00000000..1754a895 --- /dev/null +++ b/doubleml/did/did_cs_binary.py @@ -0,0 +1,871 @@ +import warnings + +import numpy as np +from sklearn.utils import check_X_y + +from doubleml.data.panel_data import DoubleMLPanelData +from doubleml.did.utils._did_utils import ( + _check_anticipation_periods, + _check_control_group, + _check_gt_combination, + _check_gt_values, + _get_id_positions, + _get_never_treated_value, + _is_never_treated, + _set_id_positions, +) +from doubleml.double_ml import DoubleML +from doubleml.double_ml_score_mixins import LinearScoreMixin +from doubleml.utils._checks import ( + _check_bool, + _check_finite_predictions, + _check_is_propensity, + _check_score, + _check_trimming, +) +from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls_2d +from doubleml.utils._propensity_score import _trimm + + +class DoubleMLDIDCSBinary(LinearScoreMixin, DoubleML): + """Double machine learning for difference-in-differences models with repeated cross sections (binary setting in terms of group and time + combinations). + + Parameters + ---------- + obj_dml_data : :class:`DoubleMLPanelData` object + The :class:`DoubleMLPanelData` object providing the data and specifying the variables for the causal model. + + g_value : int + The value indicating the treatment group (first period with treatment). + Default is ``None``. This implements the case for the smallest, non-zero value of G. + + t_value_pre : int + The value indicating the baseline pre-treatment period. + + t_value_eval : int + The value indicating the period for evaluation. + + ml_g : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`g_0(d,X) = E[Y_1-Y_0|D=d, X]`. + For a binary outcome variable :math:`Y` (with values 0 and 1), a classifier implementing ``fit()`` and + ``predict_proba()`` can also be specified. If :py:func:`sklearn.base.is_classifier` returns ``True``, + ``predict_proba()`` is used otherwise ``predict()``. + + ml_m : classifier implementing ``fit()`` and ``predict_proba()`` + A machine learner implementing ``fit()`` and ``predict_proba()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestClassifier`) for the nuisance function :math:`m_0(X) = E[D=1|X]`. + Only relevant for ``score='observational'``. + + control_group : str + Specifies the control group. Either ``'never_treated'`` or ``'not_yet_treated'``. + Default is ``'never_treated'``. + + anticipation_periods : int + Number of anticipation periods. Default is ``0``. + + n_folds : int + Number of folds. + Default is ``5``. + + n_rep : int + Number of repetitions for the sample splitting. + Default is ``1``. + + score : str + A str (``'observational'`` or ``'experimental'``) specifying the score function. 
+ The ``'experimental'`` scores refers to an A/B setting, where the treatment is independent + from the pretreatment covariates. + Default is ``'observational'``. + + in_sample_normalization : bool + Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020). + Default is ``True``. + + trimming_rule : str + A str (``'truncate'`` is the only choice) specifying the trimming approach. + Default is ``'truncate'``. + + trimming_threshold : float + The threshold used for trimming. + Default is ``1e-2``. + + draw_sample_splitting : bool + Indicates whether the sample splitting should be drawn during initialization of the object. + Default is ``True``. + + print_periods : bool + Indicates whether to print information about the evaluated periods. + Default is ``False``. + + """ + + def __init__( + self, + obj_dml_data, + g_value, + t_value_pre, + t_value_eval, + ml_g, + ml_m=None, + control_group="never_treated", + anticipation_periods=0, + n_folds=5, + n_rep=1, + score="observational", + in_sample_normalization=True, + trimming_rule="truncate", + trimming_threshold=1e-2, + draw_sample_splitting=True, + print_periods=False, + ): + super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting=False) + + self._check_data(self._dml_data) + g_values = self._dml_data.g_values + t_values = self._dml_data.t_values + + _check_bool(print_periods, "print_periods") + self._print_periods = print_periods + self._control_group = _check_control_group(control_group) + self._never_treated_value = _get_never_treated_value(g_values) + self._anticipation_periods = _check_anticipation_periods(anticipation_periods) + + _check_gt_combination( + (g_value, t_value_pre, t_value_eval), g_values, t_values, self.never_treated_value, self.anticipation_periods + ) + self._g_value = g_value + self._t_value_pre = t_value_pre + self._t_value_eval = t_value_eval + + # check if post_treatment evaluation + if g_value <= t_value_eval: + post_treatment = True + else: + post_treatment = False + + self._post_treatment = post_treatment + + if self._print_periods: + print( + f"Evaluation of ATT({g_value}, {t_value_eval}), with pre-treatment period {t_value_pre},\n" + + f"post-treatment: {post_treatment}. Control group: {control_group}.\n" + ) + + # Preprocess data + self._data_subset = self._preprocess_data(self._g_value, self._t_value_pre, self._t_value_eval) + + # Handling id values to match pairwise evaluation & simultaneous inference + if not np.all(np.isin(self.data_subset.index, self._dml_data.data.index)): + raise ValueError("The index values in the data subset are not a subset of the original index values.") + + # Find position of data subset in original data + # These entries should be replaced by nuisance predictions, all others should be set to 0. 
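# Conceptual sketch of the id-position bookkeeping described in the comment above (this is a
# simplified stand-in, not the actual _set_id_positions helper): subset-level score
# contributions are written back into a vector over all rows of the original data, with
# zeros everywhere else so that inference runs on the full sample size.
import numpy as np

n_obs_original = 6
id_positions = np.array([0, 2, 3, 5])           # rows of the original data in this subset
psi_subset = np.array([0.4, -0.1, 0.3, 0.2])    # score contributions computed on the subset
psi_full = np.full(n_obs_original, 0.0)
psi_full[id_positions] = psi_subset
print(psi_full)                                  # [ 0.4  0.  -0.1  0.3  0.   0.2]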
+ self._id_positions = self.data_subset.index.values + + # Numeric values for positions of the entries in id_panel_data inside id_original + # np.nonzero(np.isin(id_original, id_panel_data)) + self._n_obs_subset = self.data_subset.shape[0] # Effective sample size used for resampling + + # Save x and y for later ML estimation + self._x_data_subset = self.data_subset.loc[:, self._dml_data.x_cols].values + self._y_data_subset = self.data_subset.loc[:, self._dml_data.y_col].values + self._g_data_subset = self.data_subset.loc[:, "G_indicator"].values + self._t_data_subset = self.data_subset.loc[:, "t_indicator"].values + + valid_scores = ["observational", "experimental"] + _check_score(self.score, valid_scores, allow_callable=False) + + self._in_sample_normalization = in_sample_normalization + if not isinstance(self.in_sample_normalization, bool): + raise TypeError( + "in_sample_normalization indicator has to be boolean. " + + f"Object of type {str(type(self.in_sample_normalization))} passed." + ) + + # set stratication for resampling + self._strata = self.data_subset["G_indicator"] + 2 * self.data_subset["t_indicator"] + self._n_obs_sample_splitting = self.n_obs_subset + if draw_sample_splitting: + self.draw_sample_splitting() + + # check learners + ml_g_is_classifier = self._check_learner(ml_g, "ml_g", regressor=True, classifier=True) + if self.score == "observational": + _ = self._check_learner(ml_m, "ml_m", regressor=False, classifier=True) + self._learner = {"ml_g": ml_g, "ml_m": ml_m} + else: + assert self.score == "experimental" + if ml_m is not None: + warnings.warn( + ( + 'A learner ml_m has been provided for score = "experimental" but will be ignored. ' + "A learner ml_m is not required for estimation." + ) + ) + self._learner = {"ml_g": ml_g} + + if ml_g_is_classifier: + if obj_dml_data.binary_outcome: + self._predict_method = {"ml_g": "predict_proba"} + else: + raise ValueError( + f"The ml_g learner {str(ml_g)} was identified as classifier " + "but the outcome variable is not binary with values 0 and 1." + ) + else: + self._predict_method = {"ml_g": "predict"} + + if "ml_m" in self._learner: + self._predict_method["ml_m"] = "predict_proba" + self._initialize_ml_nuisance_params() + + self._trimming_rule = trimming_rule + self._trimming_threshold = trimming_threshold + _check_trimming(self._trimming_rule, self._trimming_threshold) + + self._sensitivity_implemented = True + self._external_predictions_implemented = True + + def _format_score_info_str(self): + lines = [ + f"Score function: {str(self.score)}", + f"Treatment group: {str(self.g_value)}", + f"Pre-treatment period: {str(self.t_value_pre)}", + f"Evaluation period: {str(self.t_value_eval)}", + f"Control group: {str(self.control_group)}", + f"Anticipation periods: {str(self.anticipation_periods)}", + f"Effective sample size: {str(self.n_obs_subset)}", + ] + return "\n".join(lines) + + # _format_learner_info_str method is inherited from DoubleML base class. + + @property + def g_value(self): + """ + The value indicating the treatment group (first period with treatment). + """ + return self._g_value + + @property + def t_value_eval(self): + """ + The value indicating the evaluation period. + """ + return self._t_value_eval + + @property + def t_value_pre(self): + """ + The value indicating the pre-treatment period. + """ + return self._t_value_pre + + @property + def never_treated_value(self): + """ + The value indicating that a unit was never treated. 
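# The resampling strata defined above encode the four (G, t) cells as 0, 1, 2 and 3
# (illustrative, not part of this diff), so stratified folds keep every cell represented:
import pandas as pd

cells = pd.DataFrame({"G_indicator": [0, 1, 0, 1], "t_indicator": [0, 0, 1, 1]})
print((cells["G_indicator"] + 2 * cells["t_indicator"]).tolist())   # [0, 1, 2, 3]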
+ """ + return self._never_treated_value + + @property + def post_treatment(self): + """ + Indicates whether the evaluation period is after the treatment period. + """ + return self._post_treatment + + @property + def control_group(self): + """ + The control group. + """ + return self._control_group + + @property + def anticipation_periods(self): + """ + The number of anticipation periods. + """ + return self._anticipation_periods + + @property + def data_subset(self): + """ + The preprocessed data subset. + """ + return self._data_subset + + @property + def id_positions(self): + """ + The positions of the id values in the original data. + """ + return self._id_positions + + @property + def in_sample_normalization(self): + """ + Indicates whether the in sample normalization of weights are used. + """ + return self._in_sample_normalization + + @property + def trimming_rule(self): + """ + Specifies the used trimming rule. + """ + return self._trimming_rule + + @property + def trimming_threshold(self): + """ + Specifies the used trimming threshold. + """ + return self._trimming_threshold + + @property + def n_obs_subset(self): + """ + The number of observations used for estimation. + """ + return self._n_obs_subset + + def _initialize_ml_nuisance_params(self): + if self.score == "observational": + valid_learner = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1", "ml_m"] + else: + assert self.score == "experimental" + valid_learner = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1"] + self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in valid_learner} + + def _check_data(self, obj_dml_data): + if not isinstance(obj_dml_data, DoubleMLPanelData): + raise TypeError( + "For repeated outcomes the data must be of DoubleMLPanelData type. " + f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." + ) + if obj_dml_data.z_cols is not None: + raise NotImplementedError( + "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). " + "At the moment there are not DiD models with instruments implemented." + ) + + one_treat = obj_dml_data.n_treat == 1 + if not (one_treat): + raise ValueError( + "Incompatible data. " + "To fit an DID model with DML " + "exactly one variable needs to be specified as treatment variable." 
+ ) + _check_gt_values(obj_dml_data.g_values, obj_dml_data.t_values) + return + + def _preprocess_data(self, g_value, pre_t, eval_t): + data = self._dml_data.data + + t_col = self._dml_data.t_col + id_col = self._dml_data.id_col + g_col = self._dml_data.g_col + + # relevant data subset + data_subset_indicator = data[t_col].isin([pre_t, eval_t]) + data_subset = data[data_subset_indicator].sort_values(by=[id_col, t_col]) + + # Construct G (treatment group) indicating treatment period in g + G_indicator = (data_subset[g_col] == g_value).astype(int) + + # Construct C (control group) indicating never treated or not yet treated + never_treated = _is_never_treated(data_subset[g_col], self.never_treated_value).reshape(-1) + if self.control_group == "never_treated": + C_indicator = never_treated.astype(int) + + elif self.control_group == "not_yet_treated": + # adjust max_g_value for anticipation periods + t_values = self._dml_data.t_values + max_g_value = t_values[min(np.where(t_values == eval_t)[0][0] + self.anticipation_periods, len(t_values) - 1)] + # not in G just as a additional check + later_treated = (data_subset[g_col] > max_g_value) & (G_indicator == 0) + not_yet_treated = never_treated | later_treated + C_indicator = not_yet_treated.astype(int) + + if np.sum(C_indicator) == 0: + raise ValueError("No observations in the control group.") + + data_subset = data_subset.assign(C_indicator=C_indicator, G_indicator=G_indicator) + # reduce to relevant subset + data_subset = data_subset[(data_subset["G_indicator"] == 1) | (data_subset["C_indicator"] == 1)] + # check if G and C are disjoint + assert sum(G_indicator & C_indicator) == 0 + + # add time indicator + data_subset = data_subset.assign(t_indicator=data_subset[t_col] == eval_t) + return data_subset + + def _estimate_conditional_g( + self, x, y, d_val, t_val, d_arr, t_arr, smpls_cond, external_prediction, learner_param_key, n_jobs_cv, return_models + ): + """Helper function to estimate conditional g_hat for fixed d and t.""" + g_hat_cond = {} + condition = (d_arr == d_val) & (t_arr == t_val) + + if external_prediction is not None: + ml_g_targets = np.full_like(y, np.nan, dtype="float64") + ml_g_targets[condition] = y[condition] + ml_pred = _get_id_positions(external_prediction, self.id_positions) + g_hat_cond = {"preds": ml_pred, "targets": ml_g_targets, "models": None} + else: + g_hat_cond = _dml_cv_predict( + self._learner["ml_g"], + x, + y, + smpls_cond, + n_jobs=n_jobs_cv, + est_params=self._get_params(learner_param_key), + method=self._predict_method["ml_g"], + return_models=return_models, + ) + _check_finite_predictions(g_hat_cond["preds"], self._learner["ml_g"], "ml_g", smpls_cond) + g_hat_cond["targets"] = g_hat_cond["targets"].astype(float) + g_hat_cond["targets"][~condition] = np.nan + return g_hat_cond + + def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): + + # Here: d is a binary treatment indicator + x, y = check_X_y(X=self._x_data_subset, y=self._y_data_subset, force_all_finite=False) + _, d = check_X_y(x, self._g_data_subset, force_all_finite=False) # (d is the G_indicator) + _, t = check_X_y(x, self._t_data_subset, force_all_finite=False) + + # THIS DIFFERS FROM THE PAPER due to stratified splitting this should be the same for each fold + # nuisance estimates of the uncond. treatment prob. 
+ p_hat = np.full_like(d, d.mean(), dtype="float64") + lambda_hat = np.full_like(t, t.mean(), dtype="float64") + + # nuisance g + smpls_d0_t0, smpls_d0_t1, smpls_d1_t0, smpls_d1_t1 = _get_cond_smpls_2d(smpls, d, t) + + g_hat_d0_t0 = self._estimate_conditional_g( + x, y, 0, 0, d, t, smpls_d0_t0, external_predictions["ml_g_d0_t0"], "ml_g_d0_t0", n_jobs_cv, return_models + ) + g_hat_d0_t1 = self._estimate_conditional_g( + x, y, 0, 1, d, t, smpls_d0_t1, external_predictions["ml_g_d0_t1"], "ml_g_d0_t1", n_jobs_cv, return_models + ) + g_hat_d1_t0 = self._estimate_conditional_g( + x, y, 1, 0, d, t, smpls_d1_t0, external_predictions["ml_g_d1_t0"], "ml_g_d1_t0", n_jobs_cv, return_models + ) + g_hat_d1_t1 = self._estimate_conditional_g( + x, y, 1, 1, d, t, smpls_d1_t1, external_predictions["ml_g_d1_t1"], "ml_g_d1_t1", n_jobs_cv, return_models + ) + + # only relevant for observational setting + m_hat = {"preds": None, "targets": None, "models": None} + if self.score == "observational": + # nuisance m + if external_predictions["ml_m"] is not None: + ml_m_pred = _get_id_positions(external_predictions["ml_m"], self.id_positions) + m_hat = {"preds": ml_m_pred, "targets": d, "models": None} + else: + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x, + d, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + ) + + _check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls) + _check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12) + m_hat["preds"] = _trimm(m_hat["preds"], self.trimming_rule, self.trimming_threshold) + + psi_a, psi_b = self._score_elements( + y, + d, + t, + g_hat_d0_t0["preds"], + g_hat_d0_t1["preds"], + g_hat_d1_t0["preds"], + g_hat_d1_t1["preds"], + m_hat["preds"], + p_hat, + lambda_hat, + ) + + extend_kwargs = { + "n_obs": self._dml_data.n_obs, + "id_positions": self.id_positions, + } + psi_elements = { + "psi_a": _set_id_positions(psi_a, fill_value=0.0, **extend_kwargs), + "psi_b": _set_id_positions(psi_b, fill_value=0.0, **extend_kwargs), + } + preds = { + "predictions": { + "ml_g_d0_t0": _set_id_positions(g_hat_d0_t0["preds"], fill_value=np.nan, **extend_kwargs), + "ml_g_d0_t1": _set_id_positions(g_hat_d0_t1["preds"], fill_value=np.nan, **extend_kwargs), + "ml_g_d1_t0": _set_id_positions(g_hat_d1_t0["preds"], fill_value=np.nan, **extend_kwargs), + "ml_g_d1_t1": _set_id_positions(g_hat_d1_t1["preds"], fill_value=np.nan, **extend_kwargs), + "ml_m": _set_id_positions(m_hat["preds"], fill_value=np.nan, **extend_kwargs), + }, + "targets": { + "ml_g_d0_t0": _set_id_positions(g_hat_d0_t0["targets"], fill_value=np.nan, **extend_kwargs), + "ml_g_d0_t1": _set_id_positions(g_hat_d0_t1["targets"], fill_value=np.nan, **extend_kwargs), + "ml_g_d1_t0": _set_id_positions(g_hat_d1_t0["targets"], fill_value=np.nan, **extend_kwargs), + "ml_g_d1_t1": _set_id_positions(g_hat_d1_t1["targets"], fill_value=np.nan, **extend_kwargs), + "ml_m": _set_id_positions(m_hat["targets"], fill_value=np.nan, **extend_kwargs), + }, + "models": { + "ml_g_d0_t0": g_hat_d0_t0["models"], + "ml_g_d0_t1": g_hat_d0_t1["models"], + "ml_g_d1_t0": g_hat_d1_t0["models"], + "ml_g_d1_t1": g_hat_d1_t1["models"], + "ml_m": m_hat["models"], + }, + } + + return psi_elements, preds + + def _score_elements(self, y, d, t, g_hat_d0_t0, g_hat_d0_t1, g_hat_d1_t0, g_hat_d1_t1, m_hat, p_hat, lambda_hat): + # calculate residuals + resid_d0_t0 = y - g_hat_d0_t0 + resid_d0_t1 = y - g_hat_d0_t1 + 
resid_d1_t0 = y - g_hat_d1_t0 + resid_d1_t1 = y - g_hat_d1_t1 + + d1t1 = np.multiply(d, t) + d1t0 = np.multiply(d, 1.0 - t) + d0t1 = np.multiply(1.0 - d, t) + d0t0 = np.multiply(1.0 - d, 1.0 - t) + + if self.score == "observational": + if self.in_sample_normalization: + weight_psi_a = np.divide(d, np.mean(d)) + weight_g_d1_t1 = weight_psi_a + weight_g_d1_t0 = -1.0 * weight_psi_a + weight_g_d0_t1 = -1.0 * weight_psi_a + weight_g_d0_t0 = weight_psi_a + + weight_resid_d1_t1 = np.divide(d1t1, np.mean(d1t1)) + weight_resid_d1_t0 = -1.0 * np.divide(d1t0, np.mean(d1t0)) + + prop_weighting = np.divide(m_hat, 1.0 - m_hat) + unscaled_d0_t1 = np.multiply(d0t1, prop_weighting) + weight_resid_d0_t1 = -1.0 * np.divide(unscaled_d0_t1, np.mean(unscaled_d0_t1)) + + unscaled_d0_t0 = np.multiply(d0t0, prop_weighting) + weight_resid_d0_t0 = np.divide(unscaled_d0_t0, np.mean(unscaled_d0_t0)) + else: + weight_psi_a = np.divide(d, p_hat) + weight_g_d1_t1 = weight_psi_a + weight_g_d1_t0 = -1.0 * weight_psi_a + weight_g_d0_t1 = -1.0 * weight_psi_a + weight_g_d0_t0 = weight_psi_a + + weight_resid_d1_t1 = np.divide(d1t1, np.multiply(p_hat, lambda_hat)) + weight_resid_d1_t0 = -1.0 * np.divide(d1t0, np.multiply(p_hat, 1.0 - lambda_hat)) + + prop_weighting = np.divide(m_hat, 1.0 - m_hat) + weight_resid_d0_t1 = -1.0 * np.multiply(np.divide(d0t1, np.multiply(p_hat, lambda_hat)), prop_weighting) + weight_resid_d0_t0 = np.multiply(np.divide(d0t0, np.multiply(p_hat, 1.0 - lambda_hat)), prop_weighting) + else: + assert self.score == "experimental" + if self.in_sample_normalization: + weight_psi_a = np.ones_like(y) + weight_g_d1_t1 = weight_psi_a + weight_g_d1_t0 = -1.0 * weight_psi_a + weight_g_d0_t1 = -1.0 * weight_psi_a + weight_g_d0_t0 = weight_psi_a + + weight_resid_d1_t1 = np.divide(d1t1, np.mean(d1t1)) + weight_resid_d1_t0 = -1.0 * np.divide(d1t0, np.mean(d1t0)) + weight_resid_d0_t1 = -1.0 * np.divide(d0t1, np.mean(d0t1)) + weight_resid_d0_t0 = np.divide(d0t0, np.mean(d0t0)) + else: + weight_psi_a = np.ones_like(y) + weight_g_d1_t1 = weight_psi_a + weight_g_d1_t0 = -1.0 * weight_psi_a + weight_g_d0_t1 = -1.0 * weight_psi_a + weight_g_d0_t0 = weight_psi_a + + weight_resid_d1_t1 = np.divide(d1t1, np.multiply(p_hat, lambda_hat)) + weight_resid_d1_t0 = -1.0 * np.divide(d1t0, np.multiply(p_hat, 1.0 - lambda_hat)) + weight_resid_d0_t1 = -1.0 * np.divide(d0t1, np.multiply(1.0 - p_hat, lambda_hat)) + weight_resid_d0_t0 = np.divide(d0t0, np.multiply(1.0 - p_hat, 1.0 - lambda_hat)) + + # set score elements + psi_a = -1.0 * weight_psi_a + + # psi_b + psi_b_1 = ( + np.multiply(weight_g_d1_t1, g_hat_d1_t1) + + np.multiply(weight_g_d1_t0, g_hat_d1_t0) + + np.multiply(weight_g_d0_t0, g_hat_d0_t0) + + np.multiply(weight_g_d0_t1, g_hat_d0_t1) + ) + psi_b_2 = ( + np.multiply(weight_resid_d1_t1, resid_d1_t1) + + np.multiply(weight_resid_d1_t0, resid_d1_t0) + + np.multiply(weight_resid_d0_t0, resid_d0_t0) + + np.multiply(weight_resid_d0_t1, resid_d0_t1) + ) + + psi_b = psi_b_1 + psi_b_2 + + return psi_a, psi_b + + def _nuisance_tuning( + self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search + ): + x, y = check_X_y(X=self._x_data_subset, y=self._y_data_subset, force_all_finite=False) + _, d = check_X_y(x, self._g_data_subset, force_all_finite=False) # (d is the G_indicator) + _, t = check_X_y(x, self._t_data_subset, force_all_finite=False) + + if scoring_methods is None: + scoring_methods = {"ml_g": None, "ml_m": None} + + # nuisance training sets conditional on d and t + smpls_d0_t0, 
smpls_d0_t1, smpls_d1_t0, smpls_d1_t1 = _get_cond_smpls_2d(smpls, d, t) + train_inds = [train_index for (train_index, _) in smpls] + train_inds_d0_t0 = [train_index for (train_index, _) in smpls_d0_t0] + train_inds_d0_t1 = [train_index for (train_index, _) in smpls_d0_t1] + train_inds_d1_t0 = [train_index for (train_index, _) in smpls_d1_t0] + train_inds_d1_t1 = [train_index for (train_index, _) in smpls_d1_t1] + + tune_args = { + "n_folds_tune": n_folds_tune, + "n_jobs_cv": n_jobs_cv, + "search_mode": search_mode, + "n_iter_randomized_search": n_iter_randomized_search, + } + + g_d0_t0_tune_res = _dml_tune( + y, + x, + train_inds_d0_t0, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + **tune_args, + ) + + g_d0_t1_tune_res = _dml_tune( + y, + x, + train_inds_d0_t1, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + **tune_args, + ) + + g_d1_t0_tune_res = _dml_tune( + y, + x, + train_inds_d1_t0, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + **tune_args, + ) + + g_d1_t1_tune_res = _dml_tune( + y, + x, + train_inds_d1_t1, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + **tune_args, + ) + + m_tune_res = list() + if self.score == "observational": + m_tune_res = _dml_tune( + d, + x, + train_inds, + self._learner["ml_m"], + param_grids["ml_m"], + scoring_methods["ml_m"], + **tune_args, + ) + + g_d0_t0_best_params = [xx.best_params_ for xx in g_d0_t0_tune_res] + g_d0_t1_best_params = [xx.best_params_ for xx in g_d0_t1_tune_res] + g_d1_t0_best_params = [xx.best_params_ for xx in g_d1_t0_tune_res] + g_d1_t1_best_params = [xx.best_params_ for xx in g_d1_t1_tune_res] + + if self.score == "observational": + m_best_params = [xx.best_params_ for xx in m_tune_res] + params = { + "ml_g_d0_t0": g_d0_t0_best_params, + "ml_g_d0_t1": g_d0_t1_best_params, + "ml_g_d1_t0": g_d1_t0_best_params, + "ml_g_d1_t1": g_d1_t1_best_params, + "ml_m": m_best_params, + } + tune_res = { + "g_d0_t0_tune": g_d0_t0_tune_res, + "g_d0_t1_tune": g_d0_t1_tune_res, + "g_d1_t0_tune": g_d1_t0_tune_res, + "g_d1_t1_tune": g_d1_t1_tune_res, + "m_tune": m_tune_res, + } + else: + params = { + "ml_g_d0_t0": g_d0_t0_best_params, + "ml_g_d0_t1": g_d0_t1_best_params, + "ml_g_d1_t0": g_d1_t0_best_params, + "ml_g_d1_t1": g_d1_t1_best_params, + } + tune_res = { + "g_d0_t0_tune": g_d0_t0_tune_res, + "g_d0_t1_tune": g_d0_t1_tune_res, + "g_d1_t0_tune": g_d1_t0_tune_res, + "g_d1_t1_tune": g_d1_t1_tune_res, + } + + res = {"params": params, "tune_res": tune_res} + + return res + + def _sensitivity_element_est(self, preds): + y = self._y_data_subset + d = self._g_data_subset + t = self._t_data_subset + + m_hat = _get_id_positions(preds["predictions"]["ml_m"], self.id_positions) + g_hat_d0_t0 = _get_id_positions(preds["predictions"]["ml_g_d0_t0"], self.id_positions) + g_hat_d0_t1 = _get_id_positions(preds["predictions"]["ml_g_d0_t1"], self.id_positions) + g_hat_d1_t0 = _get_id_positions(preds["predictions"]["ml_g_d1_t0"], self.id_positions) + g_hat_d1_t1 = _get_id_positions(preds["predictions"]["ml_g_d1_t1"], self.id_positions) + + d0t0 = np.multiply(1.0 - d, 1.0 - t) + d0t1 = np.multiply(1.0 - d, t) + d1t0 = np.multiply(d, 1.0 - t) + d1t1 = np.multiply(d, t) + + g_hat = ( + np.multiply(d0t0, g_hat_d0_t0) + + np.multiply(d0t1, g_hat_d0_t1) + + np.multiply(d1t0, g_hat_d1_t0) + + np.multiply(d1t1, g_hat_d1_t1) + ) + sigma2_score_element = np.square(y - g_hat) + sigma2 = np.mean(sigma2_score_element) + psi_sigma2 = sigma2_score_element - sigma2 + + 
# calc m(W,alpha) and Riesz representer + p_hat = np.mean(d) + lambda_hat = np.mean(t) + if self.score == "observational": + propensity_weight_d0 = np.divide(m_hat, 1.0 - m_hat) + if self.in_sample_normalization: + weight_d0t1 = np.multiply(d0t1, propensity_weight_d0) + weight_d0t0 = np.multiply(d0t0, propensity_weight_d0) + mean_weight_d0t1 = np.mean(weight_d0t1) + mean_weight_d0t0 = np.mean(weight_d0t0) + + m_alpha = np.multiply( + np.divide(d, p_hat), + np.divide(1.0, np.mean(d1t1)) + + np.divide(1.0, np.mean(d1t0)) + + np.divide(propensity_weight_d0, mean_weight_d0t1) + + np.divide(propensity_weight_d0, mean_weight_d0t0), + ) + + rr = ( + np.divide(d1t1, np.mean(d1t1)) + - np.divide(d1t0, np.mean(d1t0)) + - np.divide(weight_d0t1, mean_weight_d0t1) + + np.divide(weight_d0t0, mean_weight_d0t0) + ) + else: + m_alpha_1 = np.divide(1.0, lambda_hat) + np.divide(1.0, 1.0 - lambda_hat) + m_alpha = np.multiply(np.divide(d, np.square(p_hat)), np.multiply(m_alpha_1, 1.0 + propensity_weight_d0)) + + rr_1 = np.divide(t, np.multiply(p_hat, lambda_hat)) + np.divide(1.0 - t, np.multiply(p_hat, 1.0 - lambda_hat)) + rr_2 = d + np.multiply(1.0 - d, propensity_weight_d0) + rr = np.multiply(rr_1, rr_2) + else: + assert self.score == "experimental" + if self.in_sample_normalization: + m_alpha = ( + np.divide(1.0, np.mean(d1t1)) + + np.divide(1.0, np.mean(d1t0)) + + np.divide(1.0, np.mean(d0t1)) + + np.divide(1.0, np.mean(d0t0)) + ) + rr = ( + np.divide(d1t1, np.mean(d1t1)) + - np.divide(d1t0, np.mean(d1t0)) + - np.divide(d0t1, np.mean(d0t1)) + + np.divide(d0t0, np.mean(d0t0)) + ) + else: + m_alpha = ( + np.divide(1.0, np.multiply(p_hat, lambda_hat)) + + np.divide(1.0, np.multiply(p_hat, 1.0 - lambda_hat)) + + np.divide(1.0, np.multiply(1.0 - p_hat, lambda_hat)) + + np.divide(1.0, np.multiply(1.0 - p_hat, 1.0 - lambda_hat)) + ) + rr = ( + np.divide(d1t1, np.multiply(p_hat, lambda_hat)) + - np.divide(d1t0, np.multiply(p_hat, 1.0 - lambda_hat)) + - np.divide(d0t1, np.multiply(1.0 - p_hat, lambda_hat)) + + np.divide(d0t0, np.multiply(1.0 - p_hat, 1.0 - lambda_hat)) + ) + + nu2_score_element = np.multiply(2.0, m_alpha) - np.square(rr) + nu2 = np.mean(nu2_score_element) + psi_nu2 = nu2_score_element - nu2 + + extend_kwargs = { + "n_obs": self._dml_data.n_obs, + "id_positions": self.id_positions, + "fill_value": 0.0, + } + + # add scaling to make variance estimation consistent (sample size difference) + scaling = self._dml_data.n_obs / self._n_obs_subset + element_dict = { + "sigma2": sigma2, + "nu2": nu2, + "psi_sigma2": scaling * _set_id_positions(psi_sigma2, **extend_kwargs), + "psi_nu2": scaling * _set_id_positions(psi_nu2, **extend_kwargs), + "riesz_rep": scaling * _set_id_positions(rr, **extend_kwargs), + } + return element_dict + + def sensitivity_benchmark(self, benchmarking_set, fit_args=None): + """ + Computes a benchmark for a given set of features. + Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates. + + Parameters + ---------- + benchmarking_set : list + List of features to be used for benchmarking. + + fit_args : dict, optional + Additional arguments for the fit method. + Default is None. + + Returns + ------- + benchmark_results : pandas.DataFrame + Benchmark results. + """ + if self.score == "experimental": + warnings.warn( + "Sensitivity benchmarking for experimental score may not be meaningful. 
" + "Consider using score='observational' for conditional treatment assignment.", + UserWarning, + ) + + return super().sensitivity_benchmark(benchmarking_set, fit_args) diff --git a/doubleml/did/did_multi.py b/doubleml/did/did_multi.py index 8c5d5163..972f6e64 100644 --- a/doubleml/did/did_multi.py +++ b/doubleml/did/did_multi.py @@ -10,8 +10,10 @@ from sklearn.base import clone from doubleml.data import DoubleMLPanelData +from doubleml.data.utils.panel_data_utils import _subtract_periods_safe from doubleml.did.did_aggregation import DoubleMLDIDAggregation from doubleml.did.did_binary import DoubleMLDIDBinary +from doubleml.did.did_cs_binary import DoubleMLDIDCSBinary from doubleml.did.utils._aggregation import ( _check_did_aggregation_dict, _compute_did_eventstudy_aggregation_weights, @@ -31,7 +33,7 @@ from doubleml.did.utils._plot import add_jitter from doubleml.double_ml import DoubleML from doubleml.double_ml_framework import concat -from doubleml.utils._checks import _check_score, _check_trimming +from doubleml.utils._checks import _check_bool, _check_score, _check_trimming from doubleml.utils._descriptive import generate_summary from doubleml.utils.gain_statistics import gain_statistics @@ -56,8 +58,10 @@ class DoubleMLDIDMulti: :py:class:`sklearn.ensemble.RandomForestClassifier`) for the nuisance function :math:`m_0(X) = E[D=1|X]`. Only relevant for ``score='observational'``. Default is ``None``. - gt_combinations : array-like - A list of tuples with the group-time combinations to be evaluated. + gt_combinations : array-like or str + A list of tuples with the group-time combinations to be evaluated. Can be a string with the value + ``'standard'``, ``'all'`` or ``'universal'``, which constructs the corresponding combinations automatically. + Default is ``'standard'``. control_group : str Specifies the control group. Either ``'never_treated'`` or ``'not_yet_treated'``. @@ -80,6 +84,10 @@ class DoubleMLDIDMulti: from the pretreatment covariates. Default is ``'observational'``. + panel : bool + Indicates whether to rely on panel data structure (``True``) or repeated cross sections (``False``). + Default is ``True``. + in_sample_normalization : bool Indicates whether to use in-sample normalization of weights. Default is ``True``. @@ -140,6 +148,7 @@ def __init__( n_folds=5, n_rep=1, score="observational", + panel=True, in_sample_normalization=True, trimming_rule="truncate", trimming_threshold=1e-2, @@ -179,6 +188,14 @@ def __init__( valid_scores = ["observational", "experimental"] _check_score(self.score, valid_scores, allow_callable=False) + _check_bool(panel, "panel") + self._panel = panel + # set score dim (n_elements, n_thetas, n_rep), just for checking purposes + if self.panel: + self._score_dim = (self._dml_data.n_ids, self.n_gt_atts, self.n_rep) + else: + self._score_dim = (self._dml_data.n_obs, self.n_gt_atts, self.n_rep) + # initialize framework which is constructed after the fit method is called self._framework = None @@ -332,6 +349,13 @@ def never_treated_value(self): """ return self._never_treated_value + @property + def panel(self): + """ + Indicates whether to rely on panel data structure (``True``) or repeated cross sections (``False``). 
+ """ + return self._panel + @property def in_sample_normalization(self): """ @@ -968,8 +992,9 @@ def plot_effects( first_treated_periods = sorted(df["First Treated"].unique()) n_periods = len(first_treated_periods) - # Set up colors - colors = dict(zip(["pre", "post"], sns.color_palette(color_palette)[:2])) + # Set up colors - ensure 'post' always gets the second color + palette_colors = sns.color_palette(color_palette) + colors = {"pre": palette_colors[0], "post": palette_colors[1], "anticipation": palette_colors[2]} # Check if x-axis is datetime or convert to float is_datetime = pd.api.types.is_datetime64_any_dtype(df["Evaluation Period"]) @@ -1013,9 +1038,20 @@ def plot_effects( Line2D([0], [0], color="red", linestyle=":", alpha=0.7, label="Treatment start"), Line2D([0], [0], color="black", linestyle="--", alpha=0.5, label="Zero effect"), Line2D([0], [0], marker="o", color=colors["pre"], linestyle="None", label="Pre-treatment", markersize=5), - Line2D([0], [0], marker="o", color=colors["post"], linestyle="None", label="Post-treatment", markersize=5), ] - legend_ax.legend(handles=legend_elements, loc="center", ncol=4, mode="expand", borderaxespad=0.0) + + if self.anticipation_periods > 0: + legend_elements.append( + Line2D( + [0], [0], marker="o", color=colors["anticipation"], linestyle="None", label="Anticipation", markersize=5 + ) + ) + + legend_elements.append( + Line2D([0], [0], marker="o", color=colors["post"], linestyle="None", label="Post-treatment", markersize=5) + ) + + legend_ax.legend(handles=legend_elements, loc="center", ncol=len(legend_elements), mode="expand", borderaxespad=0.0) # Set title and layout plt.suptitle(title, y=1.02) @@ -1036,7 +1072,7 @@ def _plot_single_group(self, ax, period_df, period, colors, is_datetime, jitter_ period : int or datetime Treatment period for this group. colors : dict - Dictionary with 'pre' and 'post' color values. + Dictionary with 'pre', 'anticipation' (if applicable), and 'post' color values. is_datetime : bool Whether the x-axis represents datetime values. 
jitter_value : float @@ -1053,56 +1089,64 @@ def _plot_single_group(self, ax, period_df, period, colors, is_datetime, jitter_ ax.axvline(x=period, color="red", linestyle=":", alpha=0.7) ax.axhline(y=0, color="black", linestyle="--", alpha=0.5) - # Split and jitter data - pre_treatment = add_jitter( - period_df[period_df["Pre-Treatment"]], - "Evaluation Period", - is_datetime=is_datetime, - jitter_value=jitter_value, - ) - post_treatment = add_jitter( - period_df[~period_df["Pre-Treatment"]], - "Evaluation Period", - is_datetime=is_datetime, - jitter_value=jitter_value, - ) - - # Plot pre-treatment points - if not pre_treatment.empty: - ax.scatter(pre_treatment["jittered_x"], pre_treatment["Estimate"], color=colors["pre"], alpha=0.8, s=30) - ax.errorbar( - pre_treatment["jittered_x"], - pre_treatment["Estimate"], - yerr=[ - pre_treatment["Estimate"] - pre_treatment["CI Lower"], - pre_treatment["CI Upper"] - pre_treatment["Estimate"], - ], - fmt="o", - capsize=3, - color=colors["pre"], - markersize=4, - markeredgewidth=1, - linewidth=1, + # Categorize periods + if is_datetime: + # For datetime, use safe period arithmetic that handles both timedelta-compatible and period-only units + anticipation_ge_mask = _subtract_periods_safe( + period_df["Evaluation Period"], period, self.anticipation_periods, self._dml_data.datetime_unit + ) + anticipation_mask = ( + (self.anticipation_periods > 0) + & period_df["Pre-Treatment"] + & anticipation_ge_mask + & (period_df["Evaluation Period"] < period) ) + else: + # For numeric periods, simple arithmetic works + anticipation_mask = ( + (self.anticipation_periods > 0) + & period_df["Pre-Treatment"] + & (period_df["Evaluation Period"] >= period - self.anticipation_periods) + & (period_df["Evaluation Period"] < period) + ) + + pre_treatment_mask = period_df["Pre-Treatment"] & ~anticipation_mask + post_treatment_mask = ~period_df["Pre-Treatment"] + + # Define category mappings + categories = [("pre", pre_treatment_mask), ("anticipation", anticipation_mask), ("post", post_treatment_mask)] - # Plot post-treatment points - if not post_treatment.empty: - ax.scatter(post_treatment["jittered_x"], post_treatment["Estimate"], color=colors["post"], alpha=0.8, s=30) - ax.errorbar( - post_treatment["jittered_x"], - post_treatment["Estimate"], - yerr=[ - post_treatment["Estimate"] - post_treatment["CI Lower"], - post_treatment["CI Upper"] - post_treatment["Estimate"], - ], - fmt="o", - capsize=3, - color=colors["post"], - markersize=4, - markeredgewidth=1, - linewidth=1, + # Plot each category + for category_name, mask in categories: + if not mask.any(): + continue + + category_data = add_jitter( + period_df[mask], + "Evaluation Period", + is_datetime=is_datetime, + jitter_value=jitter_value, ) + if not category_data.empty: + ax.scatter( + category_data["jittered_x"], category_data["Estimate"], color=colors[category_name], alpha=0.8, s=30 + ) + ax.errorbar( + category_data["jittered_x"], + category_data["Estimate"], + yerr=[ + category_data["Estimate"] - category_data["CI Lower"], + category_data["CI Upper"] - category_data["Estimate"], + ], + fmt="o", + capsize=3, + color=colors[category_name], + markersize=4, + markeredgewidth=1, + linewidth=1, + ) + # Format axes if is_datetime: period_str = np.datetime64(period, self._dml_data.datetime_unit) @@ -1250,7 +1294,10 @@ def _check_external_predictions(self, external_predictions): + f"Passed keys: {set(external_predictions.keys())}." 
) - expected_learner_keys = ["ml_g0", "ml_g1", "ml_m"] + if self.panel: + expected_learner_keys = ["ml_g0", "ml_g1", "ml_m"] + else: + expected_learner_keys = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1", "ml_m"] for key, value in external_predictions.items(): if not isinstance(value, dict): raise TypeError( @@ -1268,12 +1315,7 @@ def _rename_external_predictions(self, external_predictions): d_col = self._dml_data.d_cols[0] ext_pred_dict = {gt_combination: {d_col: {}} for gt_combination in self.gt_labels} for gt_combination in self.gt_labels: - if "ml_g0" in external_predictions[gt_combination]: - ext_pred_dict[gt_combination][d_col]["ml_g0"] = external_predictions[gt_combination]["ml_g0"] - if "ml_g1" in external_predictions[gt_combination]: - ext_pred_dict[gt_combination][d_col]["ml_g1"] = external_predictions[gt_combination]["ml_g1"] - if "ml_m" in external_predictions[gt_combination]: - ext_pred_dict[gt_combination][d_col]["ml_m"] = external_predictions[gt_combination]["ml_m"] + ext_pred_dict[gt_combination][d_col].update(external_predictions[gt_combination]) return ext_pred_dict @@ -1304,9 +1346,15 @@ def _initialize_models(self): "draw_sample_splitting": True, "print_periods": self._print_periods, } + if self.panel: + ModelClass = DoubleMLDIDBinary + else: + ModelClass = DoubleMLDIDCSBinary + + # iterate over all group-time combinations for i_model, (g_value, t_value_pre, t_value_eval) in enumerate(self.gt_combinations): # initialize models for all levels - model = DoubleMLDIDBinary(g_value=g_value, t_value_pre=t_value_pre, t_value_eval=t_value_eval, **kwargs) + model = ModelClass(g_value=g_value, t_value_pre=t_value_pre, t_value_eval=t_value_eval, **kwargs) modellist[i_model] = model diff --git a/doubleml/did/tests/_utils_did_cs_manual.py b/doubleml/did/tests/_utils_did_cs_manual.py index f14a52a0..ce6f8870 100644 --- a/doubleml/did/tests/_utils_did_cs_manual.py +++ b/doubleml/did/tests/_utils_did_cs_manual.py @@ -178,12 +178,12 @@ def fit_nuisance_did_cs( m_hat_list.append(np.zeros_like(g_hat_d1_t1_list[idx], dtype="float64")) p_hat_list = [] - for train_index, _ in smpls: - p_hat_list.append(np.mean(d[train_index])) + for _ in smpls: + p_hat_list.append(np.mean(d)) lambda_hat_list = [] - for train_index, _ in smpls: - lambda_hat_list.append(np.mean(t[train_index])) + for _ in smpls: + lambda_hat_list.append(np.mean(t)) return g_hat_d0_t0_list, g_hat_d0_t1_list, g_hat_d1_t0_list, g_hat_d1_t1_list, m_hat_list, p_hat_list, lambda_hat_list diff --git a/doubleml/did/tests/_utils_did_manual.py b/doubleml/did/tests/_utils_did_manual.py index e314c301..b067e44d 100644 --- a/doubleml/did/tests/_utils_did_manual.py +++ b/doubleml/did/tests/_utils_did_manual.py @@ -104,7 +104,7 @@ def fit_nuisance_did( m_hat_list = fit_predict_proba(d, x, ml_m, m_params, smpls, trimming_threshold=trimming_threshold) p_hat_list = [] - for train_index, _ in smpls: + for _ in smpls: p_hat_list.append(np.mean(d)) return g_hat0_list, g_hat1_list, m_hat_list, p_hat_list diff --git a/doubleml/did/tests/test_datasets.py b/doubleml/did/tests/test_datasets.py index 0e323ec9..54eb4074 100644 --- a/doubleml/did/tests/test_datasets.py +++ b/doubleml/did/tests/test_datasets.py @@ -3,7 +3,7 @@ import pytest from doubleml import DoubleMLData -from doubleml.did.datasets import make_did_CS2021, make_did_SZ2020 +from doubleml.did.datasets import make_did_CS2021, make_did_cs_CS2021, make_did_SZ2020 msg_inv_return_type = "Invalid return_type." 
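The second hunk below exercises the new cross-sectional generator. As a point of reference, a minimal end-to-end sketch, assuming only the public entry points that appear in this diff (make_did_cs_CS2021, DoubleMLPanelData, DoubleMLDIDCSBinary, and DoubleMLDIDMulti with panel=False), could look like this:

from sklearn.linear_model import LinearRegression, LogisticRegression

import doubleml as dml
from doubleml.did.datasets import make_did_cs_CS2021

# generate repeated cross-sectional data and wrap it as panel data
df = make_did_cs_CS2021(n_obs=500, dgp_type=1, time_type="float")
dml_data = dml.data.DoubleMLPanelData(
    df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]
)

# a single group-time effect on the repeated cross-section subset
dml_did_cs = dml.did.DoubleMLDIDCSBinary(
    obj_dml_data=dml_data,
    ml_g=LinearRegression(),
    ml_m=LogisticRegression(),
    g_value=2,
    t_value_pre=0,
    t_value_eval=1,
    score="observational",
)
dml_did_cs.fit()

# all standard group-time combinations; panel=False dispatches to DoubleMLDIDCSBinary internally
dml_did_multi = dml.did.DoubleMLDIDMulti(
    obj_dml_data=dml_data,
    ml_g=LinearRegression(),
    ml_m=LogisticRegression(),
    gt_combinations="standard",
    panel=False,
)
dml_did_multi.fit()

The same pattern with panel=True (the default) keeps the existing panel workflow via DoubleMLDIDBinary.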
@@ -77,3 +77,55 @@ def test_make_did_CS2021_exceptions(): msg = r"time_type must be one of \('datetime', 'float'\). Got 2." with pytest.raises(ValueError, match=msg): _ = make_did_CS2021(n_obs=100, time_type=2) + + +@pytest.fixture(scope="function", params=[0.5, 0.1]) +def lambda_t(request): + return request.param + + +@pytest.mark.ci +def test_make_did_cs_CS2021_return_types(dgp_type, include_never_treated, lambda_t, time_type, anticipation_periods): + np.random.seed(3141) + df = make_did_cs_CS2021( + n_obs=100, + dgp_type=dgp_type, + include_never_treated=include_never_treated, + lambda_t=lambda_t, + time_type=time_type, + anticipation_periods=anticipation_periods, + ) + assert isinstance(df, pd.DataFrame) + + +@pytest.mark.ci +def test_panel_vs_cs_make_did_CS2021(dgp_type, include_never_treated, time_type, anticipation_periods): + np.random.seed(3141) + df_cs = make_did_cs_CS2021( + n_obs=100, + dgp_type=dgp_type, + include_never_treated=include_never_treated, + lambda_t=1.0, + time_type=time_type, + anticipation_periods=anticipation_periods, + ) + + np.random.seed(3141) + df_panel = make_did_CS2021( + n_obs=100, + dgp_type=dgp_type, + include_never_treated=include_never_treated, + time_type=time_type, + anticipation_periods=anticipation_periods, + ) + + # check if df_cs close to df_panel + assert df_cs.shape[0] == df_panel.shape[0] + # Select numerical columns + df_cs_numeric = df_cs.select_dtypes(include=np.number) + df_panel_numeric = df_panel.select_dtypes(include=np.number) + + # Ensure the same numerical columns are being compared, in the same order + pd.testing.assert_index_equal(df_cs_numeric.columns, df_panel_numeric.columns) + + assert np.allclose(df_cs_numeric.values, df_panel_numeric.values, atol=1e-5, rtol=1e-5) diff --git a/doubleml/did/tests/test_did_binary_control_groups.py b/doubleml/did/tests/test_did_binary_control_groups.py index b8406b15..627cf50a 100644 --- a/doubleml/did/tests/test_did_binary_control_groups.py +++ b/doubleml/did/tests/test_did_binary_control_groups.py @@ -21,7 +21,7 @@ def test_control_groups_different(): dml_did_never_treated = dml.did.DoubleMLDIDBinary(control_group="never_treated", **args) dml_did_not_yet_treated = dml.did.DoubleMLDIDBinary(control_group="not_yet_treated", **args) - assert dml_did_never_treated._n_subset != dml_did_not_yet_treated._n_subset + assert dml_did_never_treated.n_obs_subset != dml_did_not_yet_treated.n_obs_subset # same treatment group assert dml_did_never_treated._n_treated_subset == dml_did_not_yet_treated._n_treated_subset diff --git a/doubleml/did/tests/test_did_binary_external_predictions.py b/doubleml/did/tests/test_did_binary_external_predictions.py index ccc136d0..0a6cf2f0 100644 --- a/doubleml/did/tests/test_did_binary_external_predictions.py +++ b/doubleml/did/tests/test_did_binary_external_predictions.py @@ -40,7 +40,7 @@ def doubleml_did_fixture(did_score, n_rep): } dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) - all_smpls = draw_smpls(n_obs, n_folds, n_rep=n_rep, groups=dml_did._g_panel) + all_smpls = draw_smpls(n_obs, n_folds, n_rep=n_rep, groups=dml_did._g_data_subset) dml_did.set_sample_splitting(all_smpls) np.random.seed(3141) @@ -112,7 +112,7 @@ def doubleml_did_panel_fixture(did_score, n_rep): } dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) - all_smpls = draw_smpls(n_obs=dml_did._n_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_panel) + all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, 
n_rep=n_rep, groups=dml_did._g_data_subset) dml_did.set_sample_splitting(all_smpls) np.random.seed(3141) diff --git a/doubleml/did/tests/test_did_binary_external_predictions_unbalanced.py b/doubleml/did/tests/test_did_binary_external_predictions_unbalanced.py new file mode 100644 index 00000000..a921efee --- /dev/null +++ b/doubleml/did/tests/test_did_binary_external_predictions_unbalanced.py @@ -0,0 +1,93 @@ +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDBinary +from doubleml.did.datasets import make_did_cs_CS2021 +from doubleml.tests._utils import draw_smpls +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_panel_fixture(did_score, n_rep): + n_obs = 500 + n_folds = 5 + dgp = 1 + + ext_predictions = {"d": {}} + df = make_did_cs_CS2021(n_obs=n_obs, dgp_type=dgp, time_type="float") + dml_panel_data = DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_panel_data, + "g_value": 2, + "t_value_pre": 0, + "t_value_eval": 1, + "score": did_score, + "n_rep": n_rep, + "draw_sample_splitting": False, + } + + dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_data_subset) + dml_did.set_sample_splitting(all_smpls) + + np.random.seed(3141) + dml_did.fit(store_predictions=True) + + all_keys = ["ml_g0", "ml_g1"] + for key in all_keys: + ext_predictions["d"][key] = dml_did.predictions[key][:, :, 0] + if did_score == "observational": + ext_predictions["d"]["ml_m"] = dml_did.predictions["ml_m"][:, :, 0] + dml_did_ext = DoubleMLDIDBinary(ml_g=DMLDummyRegressor(), ml_m=DMLDummyClassifier(), **kwargs) + dml_did_ext.set_sample_splitting(all_smpls) + np.random.seed(3141) + dml_did_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } + + return res_dict + + +@pytest.mark.ci +def test_panel_coef(doubleml_did_panel_fixture): + assert math.isclose(doubleml_did_panel_fixture["coef"], doubleml_did_panel_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_panel_se(doubleml_did_panel_fixture): + assert math.isclose(doubleml_did_panel_fixture["se"], doubleml_did_panel_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_panel_score(doubleml_did_panel_fixture): + assert np.allclose(doubleml_did_panel_fixture["score"], doubleml_did_panel_fixture["score_ext"], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_panel_nuisance_loss(doubleml_did_panel_fixture): + for key, value in doubleml_did_panel_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, doubleml_did_panel_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) diff --git a/doubleml/did/tests/test_did_binary_vs_did_panel.py 
b/doubleml/did/tests/test_did_binary_vs_did_panel.py index 1eacdf6a..426b413c 100644 --- a/doubleml/did/tests/test_did_binary_vs_did_panel.py +++ b/doubleml/did/tests/test_did_binary_vs_did_panel.py @@ -78,7 +78,7 @@ def dml_did_binary_vs_did_fixture(time_type, learner, score, in_sample_normaliza ) dml_did_binary_obj.fit() - df_wide = dml_did_binary_obj._panel_data_wide.copy() + df_wide = dml_did_binary_obj.data_subset.copy() dml_data = dml.data.DoubleMLData(df_wide, y_col="y_diff", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"]) dml_did_obj = dml.DoubleMLDID( dml_data, @@ -178,7 +178,7 @@ def test_sensitivity_elements(dml_did_binary_vs_did_fixture): ) for sensitivity_element in ["psi_sigma2", "psi_nu2", "riesz_rep"]: dml_binary_obj = dml_did_binary_vs_did_fixture["dml_did_binary_obj"] - scaling = dml_binary_obj._n_subset / dml_binary_obj._dml_data.n_obs + scaling = dml_binary_obj.n_obs_subset / dml_binary_obj._dml_data.n_ids binary_sensitivity_element = scaling * _get_id_positions( dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], dml_binary_obj._id_positions ) diff --git a/doubleml/did/tests/test_did_cs_binary_control_groups.py b/doubleml/did/tests/test_did_cs_binary_control_groups.py new file mode 100644 index 00000000..ea4f2933 --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_control_groups.py @@ -0,0 +1,31 @@ +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +df = dml.did.datasets.make_did_cs_CS2021(n_obs=500, dgp_type=1, n_pre_treat_periods=2, n_periods=4, time_type="float") +dml_data = dml.data.DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + +args = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "g_value": 2, + "t_value_pre": 0, + "t_value_eval": 1, + "score": "observational", + "n_rep": 1, +} + + +def test_control_groups_different(): + dml_did_never_treated = dml.did.DoubleMLDIDCSBinary(control_group="never_treated", **args) + dml_did_not_yet_treated = dml.did.DoubleMLDIDCSBinary(control_group="not_yet_treated", **args) + + assert dml_did_never_treated.n_obs_subset != dml_did_not_yet_treated.n_obs_subset + # same treatment group + assert dml_did_never_treated.data_subset["G_indicator"].sum() == dml_did_not_yet_treated.data_subset["G_indicator"].sum() + + dml_did_never_treated.fit() + dml_did_not_yet_treated.fit() + + assert dml_did_never_treated.coef != dml_did_not_yet_treated.coef diff --git a/doubleml/did/tests/test_did_cs_binary_exceptions.py b/doubleml/did/tests/test_did_cs_binary_exceptions.py new file mode 100644 index 00000000..b506da2d --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_exceptions.py @@ -0,0 +1,152 @@ +from unittest.mock import patch + +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +dml_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData") + +valid_arguments = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "g_value": 1, + "t_value_pre": 0, + "t_value_eval": 1, + "score": "observational", + "n_rep": 1, + "draw_sample_splitting": True, +} + + +@pytest.mark.ci +def test_input(): + # control group + msg = r"The control group has to be one of \['never_treated', 'not_yet_treated'\]. 0 was passed." 
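+ # "|" merges the two dicts (Python 3.9+); keys in invalid_arguments override those in valid_arguments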
+ with pytest.raises(ValueError, match=msg): + invalid_arguments = {"control_group": 0} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + # g value + msg = r"The value test is not in the set of treatment group values \[0 1\]." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"g_value": "test"} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + msg = r"The never treated group is not allowed as treatment group \(g_value=0\)." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"g_value": 0} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + msg = r"The never treated group is not allowed as treatment group \(g_value=0\)." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"g_value": 0.0} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + # t values + msg = r"The value test is not in the set of evaluation period values \[0 1\]." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"t_value_pre": "test"} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"t_value_eval": "test"} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + # in-sample normalization + msg = "in_sample_normalization indicator has to be boolean. Object of type passed." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"in_sample_normalization": "test"} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + # ml_g classifier + msg = r"The ml_g learner LogisticRegression\(\) was identified as" + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"ml_g": LogisticRegression()} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_no_control_group_exception(): + msg = "No observations in the control group." + with pytest.raises(ValueError, match=msg): + invalid_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData") + invalid_data.data["d"] = 1.0 + invalid_arguments = {"obj_dml_data": invalid_data, "control_group": "not_yet_treated"} + _ = dml.did.DoubleMLDIDCSBinary(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_check_data_exceptions(): + """Test exception handling for _check_data method in DoubleMLDIDCSBinary""" + df = pd.DataFrame(np.random.normal(size=(10, 5)), columns=[f"Col_{i}" for i in range(5)]) + + # Test 1: Data has to be DoubleMLPanelData + invalid_data_types = [ + dml.data.DoubleMLData(df, y_col="Col_0", d_cols="Col_1"), + ] + + for invalid_data in invalid_data_types: + msg = r"For repeated outcomes the data must be of DoubleMLPanelData type\." + with pytest.raises(TypeError, match=msg): + _ = dml.did.DoubleMLDIDCSBinary( + obj_dml_data=invalid_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + # Test 2: Data cannot have instrumental variables + df_with_z = dml_data.data.copy() + dml_data_with_z = dml.data.DoubleMLPanelData( + df_with_z, y_col="y", d_cols="d", id_col="id", t_col="t", z_cols=["Z1"], x_cols=["Z2", "Z3", "Z4"] + ) + + msg = r"Incompatible data. Z1 have been set as instrumental variable\(s\)." 
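+ # instrumental variables are not supported for DID models, hence NotImplementedError instead of ValueError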
+ with pytest.raises(NotImplementedError, match=msg): + _ = dml.did.DoubleMLDIDCSBinary( + obj_dml_data=dml_data_with_z, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + # Test 3: Data must have exactly one treatment variable (using mock) + with patch.object(dml_data.__class__, "n_treat", property(lambda self: 2)): + msg = ( + "Incompatible data. To fit an DID model with DML exactly one variable needs to be specified as treatment variable." + ) + with pytest.raises(ValueError, match=msg): + _ = dml.did.DoubleMLDIDCSBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + +@pytest.mark.ci +def test_benchmark_warning(): + """Test warning when sensitivity_benchmark is called with experimental score""" + args = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "g_value": 1, + "t_value_pre": 0, + "t_value_eval": 1, + "n_rep": 1, + } + # Create a DID model with experimental score + did_model = dml.did.DoubleMLDIDCSBinary(**args, score="experimental") + did_model.fit() + with pytest.warns(UserWarning, match="Sensitivity benchmarking for experimental score may not be meaningful"): + did_model.sensitivity_benchmark(["Z1", "Z2"]) diff --git a/doubleml/did/tests/test_did_cs_binary_external_predictions.py b/doubleml/did/tests/test_did_cs_binary_external_predictions.py new file mode 100644 index 00000000..f6b77f0b --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_external_predictions.py @@ -0,0 +1,171 @@ +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDCSBinary +from doubleml.did.datasets import make_did_cs_CS2021, make_did_SZ2020 +from doubleml.tests._utils import draw_smpls +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_cs_fixture(did_score, n_rep): + n_obs = 500 + n_folds = 5 + + ext_predictions = {"d": {}} + dml_data = make_did_SZ2020(n_obs=n_obs, return_type="DoubleMLPanelData") + + kwargs = { + "obj_dml_data": dml_data, + "g_value": 1, + "t_value_pre": 0, + "t_value_eval": 1, + "score": did_score, + "n_rep": n_rep, + "draw_sample_splitting": False, + } + + dml_did = DoubleMLDIDCSBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + strata = dml_did.data_subset["G_indicator"] + 2 * dml_did.data_subset["t_indicator"] + all_smpls = draw_smpls(2 * n_obs, n_folds, n_rep=n_rep, groups=strata) + dml_did.set_sample_splitting(all_smpls) + + np.random.seed(3141) + dml_did.fit(store_predictions=True) + + all_keys = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1"] + for key in all_keys: + ext_predictions["d"][key] = dml_did.predictions[key][:, :, 0] + if did_score == "observational": + ext_predictions["d"]["ml_m"] = dml_did.predictions["ml_m"][:, :, 0] + + dml_did_ext = DoubleMLDIDCSBinary(ml_g=DMLDummyRegressor(), ml_m=DMLDummyClassifier(), **kwargs) + dml_did_ext.set_sample_splitting(all_smpls) + np.random.seed(3141) + dml_did_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": 
dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } + + return res_dict + + +@pytest.mark.ci +def test_coef(doubleml_did_cs_fixture): + assert math.isclose(doubleml_did_cs_fixture["coef"], doubleml_did_cs_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_se(doubleml_did_cs_fixture): + assert math.isclose(doubleml_did_cs_fixture["se"], doubleml_did_cs_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_score(doubleml_did_cs_fixture): + assert np.allclose(doubleml_did_cs_fixture["score"], doubleml_did_cs_fixture["score_ext"], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_nuisance_loss(doubleml_did_cs_fixture): + for key, value in doubleml_did_cs_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, doubleml_did_cs_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) + + +@pytest.fixture(scope="module") +def doubleml_did_cs_panel_fixture(did_score, n_rep): + n_obs = 500 + n_folds = 5 + dgp = 1 + + ext_predictions = {"d": {}} + df = make_did_cs_CS2021(n_obs=n_obs, dgp_type=dgp, time_type="float") + dml_panel_data = DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_panel_data, + "g_value": 2, + "t_value_pre": 0, + "t_value_eval": 1, + "score": did_score, + "n_rep": n_rep, + "draw_sample_splitting": False, + } + + dml_did = DoubleMLDIDCSBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + all_smpls = draw_smpls(n_obs=dml_did.n_obs_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_data_subset) + dml_did.set_sample_splitting(all_smpls) + + np.random.seed(3141) + dml_did.fit(store_predictions=True) + + all_keys = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1"] + for key in all_keys: + ext_predictions["d"][key] = dml_did.predictions[key][:, :, 0] + if did_score == "observational": + ext_predictions["d"]["ml_m"] = dml_did.predictions["ml_m"][:, :, 0] + dml_did_ext = DoubleMLDIDCSBinary(ml_g=DMLDummyRegressor(), ml_m=DMLDummyClassifier(), **kwargs) + dml_did_ext.set_sample_splitting(all_smpls) + np.random.seed(3141) + dml_did_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } + + return res_dict + + +@pytest.mark.ci +def test_panel_coef(doubleml_did_cs_panel_fixture): + assert math.isclose( + doubleml_did_cs_panel_fixture["coef"], doubleml_did_cs_panel_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3 + ) + + +@pytest.mark.ci +def test_panel_se(doubleml_did_cs_panel_fixture): + assert math.isclose( + doubleml_did_cs_panel_fixture["se"], doubleml_did_cs_panel_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3 + ) + + +@pytest.mark.ci +def test_panel_score(doubleml_did_cs_panel_fixture): + assert np.allclose( + doubleml_did_cs_panel_fixture["score"], doubleml_did_cs_panel_fixture["score_ext"], rtol=1e-9, atol=1e-3 + ) + + +@pytest.mark.ci +def test_panel_nuisance_loss(doubleml_did_cs_panel_fixture): + for key, value in doubleml_did_cs_panel_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, 
doubleml_did_cs_panel_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) diff --git a/doubleml/did/tests/test_did_cs_binary_placebo.py b/doubleml/did/tests/test_did_cs_binary_placebo.py new file mode 100644 index 00000000..61def691 --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_placebo.py @@ -0,0 +1,58 @@ +import numpy as np +import pytest +from lightgbm import LGBMClassifier, LGBMRegressor + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDCSBinary +from doubleml.did.datasets import make_did_CS2021 + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_fixture(did_score, n_rep): + n_obs = 500 + dgp = 5 # has to be experimental (for experimental score to be valid) + df = make_did_CS2021(n_obs=n_obs, dgp=dgp, n_pre_treat_periods=3) + dml_data = DoubleMLPanelData(df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_data, + "g_value": dml_data.g_values[0], + "t_value_pre": dml_data.t_values[0], + "t_value_eval": dml_data.t_values[1], + "ml_g": LGBMRegressor(verbose=-1), + "ml_m": LGBMClassifier(verbose=-1), + "score": did_score, + "n_rep": n_rep, + "n_folds": 5, + "draw_sample_splitting": True, + } + + dml_did = DoubleMLDIDCSBinary(**kwargs) + + np.random.seed(3141) + dml_did.fit() + ci = dml_did.confint(level=0.99) + + res_dict = { + "coef": dml_did.coef[0], + "ci_lower": ci.iloc[0, 0], + "ci_upper": ci.iloc[0, 1], + } + + return res_dict + + +@pytest.mark.ci +def test_zero(doubleml_did_fixture): + assert doubleml_did_fixture["ci_lower"] <= 0.0 + assert doubleml_did_fixture["ci_upper"] >= 0.0 diff --git a/doubleml/did/tests/test_did_cs_binary_stdout.py b/doubleml/did/tests/test_did_cs_binary_stdout.py new file mode 100644 index 00000000..16135636 --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_stdout.py @@ -0,0 +1,49 @@ +import io +from contextlib import redirect_stdout + +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +dml_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData") + + +@pytest.mark.ci +def test_print_periods(): + """Test that print_periods parameter correctly controls output printing.""" + + # Create test data + dml_data = dml.did.datasets.make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData") + + # Test 1: Default case (print_periods=False) - should not print anything + f = io.StringIO() + with redirect_stdout(f): + _ = dml.did.DoubleMLDIDCSBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + print_periods=False, # Default + ) + output_default = f.getvalue() + assert output_default.strip() == "", "Expected no output with print_periods=False" + + # Test 2: With print_periods=True - should print information + f = io.StringIO() + with redirect_stdout(f): + _ = dml.did.DoubleMLDIDCSBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + print_periods=True, + ) + output_print = f.getvalue() + assert "Evaluation of ATT(1, 1), with pre-treatment period 0" in output_print + assert "post-treatment: True" in output_print + assert "Control group: 
never_treated" in output_print diff --git a/doubleml/did/tests/test_did_cs_binary_tune.py b/doubleml/did/tests/test_did_cs_binary_tune.py new file mode 100644 index 00000000..0bd2c6ab --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_tune.py @@ -0,0 +1,221 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import LogisticRegression + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_did_cs_manual import fit_did_cs, tune_nuisance_did_cs +from ._utils_did_manual import boot_did + + +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_g(request): + return request.param + + +@pytest.fixture(scope="module", params=[LogisticRegression()]) +def learner_m(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def tune_on_folds(request): + return request.param + + +def get_par_grid(learner): + if learner.__class__ in [RandomForestRegressor]: + par_grid = {"n_estimators": [5, 10, 20]} + else: + assert learner.__class__ in [LogisticRegression] + par_grid = {"C": np.logspace(-4, 2, 10)} + return par_grid + + +@pytest.fixture(scope="module") +def dml_did_fixture(generate_data_did_binary, learner_g, learner_m, score, in_sample_normalization, tune_on_folds): + par_grid = {"ml_g": get_par_grid(learner_g), "ml_m": get_par_grid(learner_m)} + n_folds_tune = 4 + + boot_methods = ["normal"] + n_folds = 2 + n_rep_boot = 499 + + # collect data + dml_panel_data = generate_data_did_binary + df = dml_panel_data._data.sort_values(by=["id", "t"]) + # Reorder data before to make both approaches compatible + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + obj_dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + n_obs = df.shape[0] + strata = df["d"] + 2 * df["t"] # only valid since it values are binary + all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=strata) + + # Set machine learning methods for m & g + ml_g = clone(learner_g) + ml_m = clone(learner_m) + + dml_args = { + "ml_g": ml_g, + "ml_m": ml_m, + "n_folds": n_folds, + "score": score, + "in_sample_normalization": in_sample_normalization, + "draw_sample_splitting": False, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDCSBinary( + dml_panel_data, + g_value=1, + t_value_pre=0, + t_value_eval=1, + **dml_args, + ) + + dml_did_obj = dml.DoubleMLDIDCS( + obj_dml_data, + **dml_args, + ) + + # synchronize the sample splitting + dml_did_obj.set_sample_splitting(all_smpls=all_smpls) + dml_did_binary_obj.set_sample_splitting(all_smpls=all_smpls) + + # tune hyperparameters + np.random.seed(3141) + tune_res = dml_did_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False) + assert isinstance(tune_res, dml.DoubleMLDIDCS) + np.random.seed(3141) + tune_res_binary = dml_did_binary_obj.tune( + par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False + ) + assert isinstance(tune_res_binary, dml.did.DoubleMLDIDCSBinary) + + dml_did_obj.fit() + dml_did_binary_obj.fit() + + # manual fit + y = 
df["y"].values + d = df["d"].values + x = df[["Z1", "Z2", "Z3", "Z4"]].values + t = df["t"].values + np.random.seed(3141) + smpls = all_smpls[0] + + if tune_on_folds: + g_d0_t0_params, g_d0_t1_params, g_d1_t0_params, g_d1_t1_params, m_params = tune_nuisance_did_cs( + y, x, d, t, clone(learner_g), clone(learner_m), smpls, score, n_folds_tune, par_grid["ml_g"], par_grid["ml_m"] + ) + else: + xx = [(np.arange(len(y)), np.array([]))] + g_d0_t0_params, g_d0_t1_params, g_d1_t0_params, g_d1_t1_params, m_params = tune_nuisance_did_cs( + y, x, d, t, clone(learner_g), clone(learner_m), xx, score, n_folds_tune, par_grid["ml_g"], par_grid["ml_m"] + ) + g_d0_t0_params = g_d0_t0_params * n_folds + g_d0_t1_params = g_d0_t1_params * n_folds + g_d1_t0_params = g_d1_t0_params * n_folds + g_d1_t1_params = g_d1_t1_params * n_folds + if score == "observational": + m_params = m_params * n_folds + else: + assert score == "experimental" + m_params = None + + res_manual = fit_did_cs( + y, + x, + d, + t, + clone(learner_g), + clone(learner_m), + all_smpls, + score, + in_sample_normalization, + g_d0_t0_params=g_d0_t0_params, + g_d0_t1_params=g_d0_t1_params, + g_d1_t0_params=g_d1_t0_params, + g_d1_t1_params=g_d1_t1_params, + m_params=m_params, + ) + + res_dict = { + "coef": dml_did_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "coef_manual": res_manual["theta"], + "se": dml_did_obj.se, + "se_binary": dml_did_binary_obj.se, + "se_manual": res_manual["se"], + "boot_methods": boot_methods, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_did( + y, + res_manual["thetas"], + res_manual["ses"], + res_manual["all_psi_a"], + res_manual["all_psi_b"], + all_smpls, + bootstrap, + n_rep_boot, + ) + + np.random.seed(3141) + dml_did_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + res_dict["boot_t_stat" + bootstrap] = dml_did_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_binary"] = dml_did_binary_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_manual"] = boot_t_stat.reshape(-1, 1, 1) + + return res_dict + + +@pytest.mark.ci +def test_dml_did_coef(dml_did_fixture): + assert math.isclose(dml_did_fixture["coef"][0], dml_did_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_did_fixture["coef_binary"][0], dml_did_fixture["coef"][0], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_did_se(dml_did_fixture): + assert math.isclose(dml_did_fixture["se"][0], dml_did_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_did_fixture["se_binary"][0], dml_did_fixture["se"][0], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_boot(dml_did_fixture): + for bootstrap in dml_did_fixture["boot_methods"]: + assert np.allclose( + dml_did_fixture["boot_t_stat" + bootstrap], + dml_did_fixture["boot_t_stat" + bootstrap + "_manual"], + rtol=1e-9, + atol=1e-4, + ) + + assert np.allclose( + dml_did_fixture["boot_t_stat" + bootstrap], + dml_did_fixture["boot_t_stat" + bootstrap + "_binary"], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py new file mode 100644 index 00000000..8fab2615 --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_panel.py @@ -0,0 +1,202 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestClassifier, 
RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 +from doubleml.did.utils._did_utils import _get_id_positions + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_binary_vs_did_fixture(time_type, learner, score, in_sample_normalization, trimming_threshold): + n_obs = 500 + dpg = 1 + + # collect data + df = make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "ml_g": clone(learner[0]), + "ml_m": clone(learner[1]), + "n_folds": 3, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": True, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDCSBinary( + dml_panel_data, + g_value=dml_panel_data.g_values[0], + t_value_pre=dml_panel_data.t_values[0], + t_value_eval=dml_panel_data.t_values[1], + **dml_args, + ) + dml_did_binary_obj.fit() + + df_subset = dml_did_binary_obj.data_subset.copy() + dml_data = dml.data.DoubleMLData( + df_subset, y_col="y", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"], t_col="t_indicator" + ) + dml_did_obj = dml.DoubleMLDIDCS( + dml_data, + **dml_args, + ) + + # use external predictions (sample splitting is hard to synchronize) + ext_predictions = {"G_indicator": {}} + ext_predictions["G_indicator"]["ml_g_d0_t0"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_g_d0_t0"][:, :, 0], dml_did_binary_obj._id_positions + ) + ext_predictions["G_indicator"]["ml_g_d0_t1"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_g_d0_t1"][:, :, 0], dml_did_binary_obj._id_positions + ) + ext_predictions["G_indicator"]["ml_g_d1_t0"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_g_d1_t0"][:, :, 0], dml_did_binary_obj._id_positions + ) + ext_predictions["G_indicator"]["ml_g_d1_t1"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_g_d1_t1"][:, :, 0], dml_did_binary_obj._id_positions + ) + if score == "observational": + ext_predictions["G_indicator"]["ml_m"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_m"][:, :, 0], dml_did_binary_obj._id_positions + ) + dml_did_obj.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "se": dml_did_obj.se, + "se_binary": dml_did_binary_obj.se, + "nuisance_loss": dml_did_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + "dml_did_binary_obj": dml_did_binary_obj, + } + + # sensitivity tests + res_dict["sensitivity_elements"] = 
dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + + dml_did_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params"] = dml_did_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["coef_binary"][0], dml_did_binary_vs_did_fixture["coef"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +@pytest.mark.ci +def test_ses(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["se_binary"][0], dml_did_binary_vs_did_fixture["se"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +# No Boostrap Tests as the observations are not ordered in the same way + + +@pytest.mark.ci +def test_nuisance_loss(dml_did_binary_vs_did_fixture): + assert ( + dml_did_binary_vs_did_fixture["nuisance_loss"].keys() == dml_did_binary_vs_did_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_binary_vs_did_fixture["nuisance_loss"].items(): + assert np.allclose(value, dml_did_binary_vs_did_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_binary_vs_did_fixture): + sensitivity_element_names = ["sigma2", "nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + for sensitivity_element in ["psi_sigma2", "psi_nu2", "riesz_rep"]: + dml_binary_obj = dml_did_binary_vs_did_fixture["dml_did_binary_obj"] + scaling = dml_binary_obj.n_obs_subset / dml_binary_obj._dml_data.n_obs + binary_sensitivity_element = scaling * _get_id_positions( + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], dml_binary_obj._id_positions + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + binary_sensitivity_element, + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_binary_vs_did_fixture): + for key in ["theta", "se", "ci"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["lower"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["lower"], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["upper"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py new file mode 100644 index 00000000..73e6b827 --- /dev/null +++ b/doubleml/did/tests/test_did_cs_binary_vs_did_cs_two_period.py @@ -0,0 +1,282 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_did_cs_manual 
import fit_did_cs, fit_sensitivity_elements_did_cs +from ._utils_did_manual import boot_did + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_cs_binary_vs_did_cs_fixture(generate_data_did_binary, learner, score, in_sample_normalization, trimming_threshold): + boot_methods = ["normal"] + n_folds = 2 + n_rep_boot = 499 + + # collect data + dml_panel_data = generate_data_did_binary + df = dml_panel_data._data.sort_values(by=["id", "t"]) + # Reorder data before to make both approaches compatible + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + obj_dml_data = dml.DoubleMLData(df, y_col="y", d_cols="d", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + n_obs = df.shape[0] + all_smpls = draw_smpls(n_obs, n_folds) + + # Set machine learning methods for m & g + ml_g = clone(learner[0]) + ml_m = clone(learner[1]) + + dml_args = { + "ml_g": ml_g, + "ml_m": ml_m, + "n_folds": n_folds, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": False, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDCSBinary( + dml_panel_data, + g_value=1, + t_value_pre=0, + t_value_eval=1, + **dml_args, + ) + + dml_did_obj = dml.DoubleMLDIDCS( + obj_dml_data, + **dml_args, + ) + + # synchronize the sample splitting + dml_did_obj.set_sample_splitting(all_smpls=all_smpls) + dml_did_binary_obj.set_sample_splitting(all_smpls=all_smpls) + + dml_did_obj.fit() + dml_did_binary_obj.fit() + + # manual fit + y = df["y"].values + d = df["d"].values + x = df[["Z1", "Z2", "Z3", "Z4"]].values + t = df["t"].values + + np.random.seed(3141) + res_manual = fit_did_cs( + y, + x, + d, + t, + clone(learner[0]), + clone(learner[1]), + all_smpls, + score, + in_sample_normalization, + trimming_threshold=trimming_threshold, + ) + + res_dict = { + "coef": dml_did_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "coef_manual": res_manual["theta"], + "se": dml_did_obj.se, + "se_binary": dml_did_binary_obj.se, + "se_manual": res_manual["se"], + "nuisance_loss": dml_did_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + "boot_methods": boot_methods, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_did( + y, + res_manual["thetas"], + res_manual["ses"], + res_manual["all_psi_a"], + res_manual["all_psi_b"], + all_smpls, + bootstrap, + n_rep_boot, + ) + + np.random.seed(3141) + dml_did_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + res_dict["boot_t_stat" + bootstrap] = dml_did_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_binary"] = dml_did_binary_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_manual"] = 
boot_t_stat.reshape(-1, 1, 1) + + # sensitivity tests + res_dict["sensitivity_elements"] = dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + res_dict["sensitivity_elements_manual"] = fit_sensitivity_elements_did_cs( + y, + d, + t, + all_coef=dml_did_obj.all_coef, + predictions=dml_did_obj.predictions, + score=score, + in_sample_normalization=in_sample_normalization, + n_rep=1, + ) + + # sensitivity tests + res_dict["sensitivity_elements"] = dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + + dml_did_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params"] = dml_did_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_cs_binary_vs_did_cs_fixture): + assert math.isclose( + dml_did_cs_binary_vs_did_cs_fixture["coef"][0], + dml_did_cs_binary_vs_did_cs_fixture["coef_manual"], + rel_tol=1e-9, + abs_tol=1e-4, + ) + assert math.isclose( + dml_did_cs_binary_vs_did_cs_fixture["coef_binary"][0], + dml_did_cs_binary_vs_did_cs_fixture["coef"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_ses(dml_did_cs_binary_vs_did_cs_fixture): + assert math.isclose( + dml_did_cs_binary_vs_did_cs_fixture["se"][0], + dml_did_cs_binary_vs_did_cs_fixture["se_manual"], + rel_tol=1e-9, + abs_tol=1e-4, + ) + assert math.isclose( + dml_did_cs_binary_vs_did_cs_fixture["se_binary"][0], + dml_did_cs_binary_vs_did_cs_fixture["se"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_boot(dml_did_cs_binary_vs_did_cs_fixture): + for bootstrap in dml_did_cs_binary_vs_did_cs_fixture["boot_methods"]: + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["boot_t_stat" + bootstrap], + dml_did_cs_binary_vs_did_cs_fixture["boot_t_stat" + bootstrap + "_manual"], + atol=1e-4, + ) + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["boot_t_stat" + bootstrap], + dml_did_cs_binary_vs_did_cs_fixture["boot_t_stat" + bootstrap + "_binary"], + atol=1e-4, + ) + + +@pytest.mark.ci +def test_nuisance_loss(dml_did_cs_binary_vs_did_cs_fixture): + assert ( + dml_did_cs_binary_vs_did_cs_fixture["nuisance_loss"].keys() + == dml_did_cs_binary_vs_did_cs_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_cs_binary_vs_did_cs_fixture["nuisance_loss"].items(): + assert np.allclose(value, dml_did_cs_binary_vs_did_cs_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_cs_binary_vs_did_cs_fixture): + sensitivity_element_names = ["sigma2", "nu2", "psi_sigma2", "psi_nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements"][sensitivity_element], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements_manual"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements"][sensitivity_element], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + for sensitivity_element in ["riesz_rep"]: + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements"][sensitivity_element], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + 
atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_cs_binary_vs_did_cs_fixture): + for key in ["theta", "se", "ci"]: + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params"][key]["lower"], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params_binary"][key]["lower"], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params"][key]["upper"], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params_binary"][key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params"][key], + dml_did_cs_binary_vs_did_cs_fixture["sensitivity_params_binary"][key], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py b/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py index 35512d8f..57b00b31 100644 --- a/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py +++ b/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py @@ -1 +1,198 @@ -# TODO: For each aggregation method check if the manual weights equal the string aggregation method. +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 +from doubleml.did.utils._aggregation import ( + _compute_did_eventstudy_aggregation_weights, + _compute_did_group_aggregation_weights, + _compute_did_time_aggregation_weights, +) + + +@pytest.fixture(scope="module", params=["group", "time", "eventstudy"]) +def aggregation_method(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def panel(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_fitted_obj(panel, score): + """Create a fitted DML object for testing.""" + n_obs = 200 + + # Create data + df = make_did_CS2021(n_obs=n_obs, dgp_type=1, time_type="float") + dml_data = dml.data.DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + # Create and fit model + ml_g = LinearRegression() + ml_m = LogisticRegression(solver="lbfgs", max_iter=250) + + dml_obj = dml.did.DoubleMLDIDMulti( + obj_dml_data=dml_data, + ml_g=ml_g, + ml_m=ml_m, + gt_combinations="standard", + panel=panel, + score=score, + n_folds=3, + n_rep=1, + ) + dml_obj.fit() + + return dml_obj + + +def _extract_manual_weights(dml_obj, aggregation_method): + """Extract manual weights from the aggregation method.""" + # Get the mask for non-masked values + selected_gt_mask = ~dml_obj.gt_index.mask + + if aggregation_method == "group": + # Exclude pre-treatment combinations for group aggregation + selected_gt_mask = selected_gt_mask & dml_obj._post_treatment_mask + aggregation_dict = _compute_did_group_aggregation_weights( + gt_index=dml_obj.gt_index, + g_values=dml_obj.g_values, + d_values=dml_obj._dml_data.d, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Group" + elif aggregation_method == "time": + # Exclude pre-treatment combinations for time aggregation + selected_gt_mask = selected_gt_mask & dml_obj._post_treatment_mask + aggregation_dict = _compute_did_time_aggregation_weights( + gt_index=dml_obj.gt_index, + g_values=dml_obj.g_values, + t_values=dml_obj.t_values, + 
d_values=dml_obj._dml_data.d, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Time" + else: + assert aggregation_method == "eventstudy" + aggregation_dict = _compute_did_eventstudy_aggregation_weights( + gt_index=dml_obj.gt_index, + g_values=dml_obj.g_values, + t_values=dml_obj.t_values, + d_values=dml_obj._dml_data.d, + time_values=dml_obj._dml_data.t, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Event Study" + return aggregation_dict + + +@pytest.mark.ci +def test_string_vs_manual_weights_aggregation(dml_fitted_obj, aggregation_method): + """Test that string aggregation methods produce identical results to manual weights.""" + + # Get string-based aggregation result + agg_string = dml_fitted_obj.aggregate(aggregation=aggregation_method) + + # Extract manual weights + manual_weights_dict = _extract_manual_weights(dml_fitted_obj, aggregation_method) + + # Get manual aggregation result + agg_manual = dml_fitted_obj.aggregate(aggregation=manual_weights_dict) + + # Compare aggregated frameworks - coefficients + np.testing.assert_allclose( + agg_string.aggregated_frameworks.thetas, + agg_manual.aggregated_frameworks.thetas, + rtol=1e-9, + atol=1e-12, + ) + + # Compare aggregated frameworks - standard errors + np.testing.assert_allclose( + agg_string.aggregated_frameworks.ses, + agg_manual.aggregated_frameworks.ses, + rtol=1e-9, + atol=1e-12, + ) + + # Compare overall aggregated framework - coefficients + np.testing.assert_allclose( + agg_string.overall_aggregated_framework.thetas, + agg_manual.overall_aggregated_framework.thetas, + rtol=1e-9, + atol=1e-12, + ) + + # Compare overall aggregated framework - standard errors + np.testing.assert_allclose( + agg_string.overall_aggregated_framework.ses, + agg_manual.overall_aggregated_framework.ses, + rtol=1e-9, + atol=1e-12, + ) + + # Compare aggregation weights + np.testing.assert_allclose( + agg_string.aggregation_weights, + agg_manual.aggregation_weights, + rtol=1e-9, + atol=1e-12, + ) + + # Compare overall aggregation weights + np.testing.assert_allclose( + agg_string.overall_aggregation_weights, + agg_manual.overall_aggregation_weights, + rtol=1e-9, + atol=1e-12, + ) + + # Compare aggregation names + assert agg_string.aggregation_names == agg_manual.aggregation_names + + # Compare number of aggregations + assert agg_string.n_aggregations == agg_manual.n_aggregations + + +@pytest.mark.ci +def test_manual_weights_properties(dml_fitted_obj, aggregation_method): + """Test that manual weights have the expected properties.""" + + manual_weights_dict = _extract_manual_weights(dml_fitted_obj, aggregation_method) + + # Check that required keys are present + assert "weight_masks" in manual_weights_dict + assert "agg_names" in manual_weights_dict + assert "agg_weights" in manual_weights_dict + + weight_masks = manual_weights_dict["weight_masks"] + agg_weights = manual_weights_dict["agg_weights"] + + # Check weight masks properties + assert isinstance(weight_masks, np.ma.MaskedArray) + assert weight_masks.ndim == 4 + assert weight_masks.shape[:-1] == dml_fitted_obj.gt_index.shape + + # Check that aggregation weights sum to 1 + assert math.isclose(np.sum(agg_weights), 1.0, rel_tol=1e-9, abs_tol=1e-12) + + # Check that individual weight masks sum to 1 (for non-masked elements) + n_aggregations = weight_masks.shape[-1] + for i in range(n_aggregations): + weights = weight_masks[..., i].compressed() + if len(weights) > 0: + assert math.isclose(np.sum(weights), 1.0, rel_tol=1e-9, abs_tol=1e-12) + + # Check that 
weight masks have the same mask as gt_index
+    for i in range(n_aggregations):
+        np.testing.assert_array_equal(weight_masks[..., i].mask, dml_fitted_obj.gt_index.mask)
diff --git a/doubleml/did/tests/test_did_multi_aggregation_single_gt.py b/doubleml/did/tests/test_did_multi_aggregation_single_gt.py
index 0f71d91b..a6ffcd49 100644
--- a/doubleml/did/tests/test_did_multi_aggregation_single_gt.py
+++ b/doubleml/did/tests/test_did_multi_aggregation_single_gt.py
@@ -27,6 +27,11 @@ def score(request):
     return request.param
 
 
+@pytest.fixture(scope="module", params=[True, False])
+def panel(request):
+    return request.param
+
+
 @pytest.fixture(scope="module", params=[True, False])
 def in_sample_normalization(request):
     return request.param
@@ -43,7 +48,7 @@ def time_type(request):
 
 
 @pytest.fixture(scope="module")
-def dml_single_gt_aggregation(aggregation, time_type, learner, score, in_sample_normalization, trimming_threshold):
+def dml_single_gt_aggregation(aggregation, time_type, learner, score, panel, in_sample_normalization, trimming_threshold):
     n_obs = 500
     dpg = 1
 
@@ -56,6 +61,7 @@ def dml_single_gt_aggregation(aggregation, time_type, learner, score, in_sample_
     dml_args = {
         "n_folds": 3,
         "score": score,
+        "panel": panel,
         "in_sample_normalization": in_sample_normalization,
         "trimming_threshold": trimming_threshold,
         "draw_sample_splitting": True,
diff --git a/doubleml/did/tests/test_did_multi_aggregation_weight_index.py b/doubleml/did/tests/test_did_multi_aggregation_weight_index.py
deleted file mode 100644
index d001a4a8..00000000
--- a/doubleml/did/tests/test_did_multi_aggregation_weight_index.py
+++ /dev/null
@@ -1 +0,0 @@
-# TODO: For each aggregation method check if the aggregated weights correspond to certain gt_combinations (group, time etc.)
diff --git a/doubleml/did/tests/test_did_multi_exceptions.py b/doubleml/did/tests/test_did_multi_exceptions.py
index aead8e48..c53d79d3 100644
--- a/doubleml/did/tests/test_did_multi_exceptions.py
+++ b/doubleml/did/tests/test_did_multi_exceptions.py
@@ -18,6 +18,7 @@
     "ml_g": LinearRegression(),
     "ml_m": LogisticRegression(),
     "gt_combinations": [(1, 0, 1)],
+    "panel": True,
 }
 
 
@@ -43,6 +44,12 @@ def test_input():
         invalid_arguments = {"control_group": 0}
         _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments))
 
+    # non boolean panel
+    msg = "panel has to be boolean. test of type <class 'str'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        invalid_arguments = {"panel": "test"}
+        _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments))
+
     # propensity score adjustments
     msg = "in_sample_normalization indicator has to be boolean. Object of type <class 'str'> passed."
     with pytest.raises(TypeError, match=msg):
@@ -93,7 +100,7 @@ def test_exception_learners():
 
 @pytest.mark.ci
 def test_exception_gt_combinations():
-    msg = r"gt_combinations must be one of \['standard', 'all'\]. test was passed."
+    msg = r"gt_combinations must be one of \['standard', 'all', 'universal'\]. test was passed."
with pytest.raises(ValueError, match=msg): invalid_arguments = {"gt_combinations": "test"} _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) @@ -170,6 +177,12 @@ def test_check_external_predictions(): valid_pred = {model.gt_labels[0]: {"ml_g0": None, "ml_g1": None, "ml_m": None}} model._check_external_predictions(valid_pred) + model_cs = dml.did.DoubleMLDIDMulti(**valid_arguments | {"panel": False}) + valid_pred = { + model.gt_labels[0]: {"ml_g_d0_t0": None, "ml_g_d0_t1": None, "ml_g_d1_t0": None, "ml_g_d1_t1": None, "ml_m": None} + } + model_cs._check_external_predictions(valid_pred) + @pytest.mark.ci def test_exceptions_before_fit(): diff --git a/doubleml/did/tests/test_did_multi_external_predictions.py b/doubleml/did/tests/test_did_multi_external_predictions.py index 2e7003f9..9bafdc6f 100644 --- a/doubleml/did/tests/test_did_multi_external_predictions.py +++ b/doubleml/did/tests/test_did_multi_external_predictions.py @@ -14,6 +14,11 @@ def did_score(request): return request.param +@pytest.fixture(scope="module", params=[True, False]) +def panel(request): + return request.param + + @pytest.fixture(scope="module", params=[1, 3]) def n_rep(request): return request.param @@ -30,7 +35,7 @@ def set_ml_g_ext(request): @pytest.fixture(scope="module") -def doubleml_did_multi_ext_fixture(did_score, n_rep, set_ml_m_ext, set_ml_g_ext): +def doubleml_did_multi_ext_fixture(did_score, panel, n_rep, set_ml_m_ext, set_ml_g_ext): n_obs = 500 n_folds = 5 dgp = 1 @@ -47,6 +52,7 @@ def doubleml_did_multi_ext_fixture(did_score, n_rep, set_ml_m_ext, set_ml_g_ext) "obj_dml_data": dml_panel_data, "gt_combinations": [(2, 0, 1)], "score": did_score, + "panel": panel, "n_rep": n_rep, "n_folds": n_folds, } @@ -69,9 +75,12 @@ def doubleml_did_multi_ext_fixture(did_score, n_rep, set_ml_m_ext, set_ml_g_ext) ml_m_ext = ml_m if set_ml_g_ext: + g_keys = ["ml_g0", "ml_g1"] if panel else ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1"] for i_gt_combination, gt_label in enumerate(dml_obj.gt_labels): - ext_pred_dict[gt_label]["ml_g0"] = dml_obj.modellist[i_gt_combination].predictions["ml_g0"][:, :, 0] - ext_pred_dict[gt_label]["ml_g1"] = dml_obj.modellist[i_gt_combination].predictions["ml_g1"][:, :, 0] + predictions = dml_obj.modellist[i_gt_combination].predictions + for key in g_keys: + ext_pred_dict[gt_label][key] = predictions[key][:, :, 0] + ml_g_ext = DMLDummyRegressor() else: ml_g_ext = ml_g @@ -100,3 +109,10 @@ def test_coef(doubleml_did_multi_ext_fixture): assert math.isclose( doubleml_did_multi_ext_fixture["coef"], doubleml_did_multi_ext_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3 ) + + +@pytest.mark.ci +def test_se(doubleml_did_multi_ext_fixture): + assert math.isclose( + doubleml_did_multi_ext_fixture["se"], doubleml_did_multi_ext_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3 + ) diff --git a/doubleml/did/tests/test_did_multi_placebo.py b/doubleml/did/tests/test_did_multi_placebo.py index 8f01d426..12435871 100644 --- a/doubleml/did/tests/test_did_multi_placebo.py +++ b/doubleml/did/tests/test_did_multi_placebo.py @@ -12,13 +12,18 @@ def did_score(request): return request.param +@pytest.fixture(scope="module", params=[True, False]) +def panel(request): + return request.param + + @pytest.fixture(scope="module", params=[1, 3]) def n_rep(request): return request.param @pytest.fixture(scope="module") -def doubleml_did_fixture(did_score, n_rep): +def doubleml_did_fixture(did_score, panel, n_rep): n_obs = 1000 dgp = 5 # has to be experimental (for experimental score to be valid) 
np.random.seed(42) @@ -36,6 +41,7 @@ def doubleml_did_fixture(did_score, n_rep): "ml_m": LogisticRegression(), "gt_combinations": gt_combinations, "score": did_score, + "panel": panel, "n_rep": n_rep, "n_folds": 5, "draw_sample_splitting": True, diff --git a/doubleml/did/tests/test_did_multi_plot.py b/doubleml/did/tests/test_did_multi_plot.py index 2eb15dcc..4a55449d 100644 --- a/doubleml/did/tests/test_did_multi_plot.py +++ b/doubleml/did/tests/test_did_multi_plot.py @@ -13,13 +13,23 @@ def did_score(request): return request.param +@pytest.fixture(scope="module", params=[True, False]) +def panel(request): + return request.param + + @pytest.fixture(scope="module", params=[1, 3]) def n_rep(request): return request.param +@pytest.fixture(scope="module", params=["standard", "all", "universal"]) +def gt_comb(request): + return request.param + + @pytest.fixture(scope="module") -def doubleml_did_fixture(did_score, n_rep): +def doubleml_did_fixture(did_score, panel, n_rep, gt_comb): n_obs = 1000 dgp = 5 # has to be experimental (for experimental score to be valid) np.random.seed(42) @@ -30,8 +40,9 @@ def doubleml_did_fixture(did_score, n_rep): "obj_dml_data": dml_data, "ml_g": LinearRegression(), "ml_m": LogisticRegression(), - "gt_combinations": "all", + "gt_combinations": gt_comb, "score": did_score, + "panel": panel, "n_rep": n_rep, "n_folds": 2, "draw_sample_splitting": True, @@ -124,7 +135,7 @@ def test_plot_effects_color_palette(doubleml_did_fixture): assert isinstance(fig, plt.Figure) # Test with a custom color list - custom_colors = [(1, 0, 0), (0, 1, 0)] # Red and green + custom_colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)] # Red, Green, Blue fig, _ = dml_obj.plot_effects(color_palette=custom_colors) assert isinstance(fig, plt.Figure) diff --git a/doubleml/did/tests/test_did_multi_return_types.py b/doubleml/did/tests/test_did_multi_return_types.py index 2e12ce10..d797230e 100644 --- a/doubleml/did/tests/test_did_multi_return_types.py +++ b/doubleml/did/tests/test_did_multi_return_types.py @@ -13,10 +13,11 @@ from doubleml.double_ml_framework import DoubleMLFramework # Test constants -N_OBS = 200 +N_IDS = 200 N_REP = 1 N_FOLDS = 3 N_REP_BOOT = 314 +N_PERIODS = 5 dml_args = { "n_rep": N_REP, @@ -30,7 +31,7 @@ datasets = {} # panel data -df_panel = make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=5, time_type="float") +df_panel = make_did_CS2021(n_obs=N_IDS, dgp_type=1, n_pre_treat_periods=2, n_periods=N_PERIODS, time_type="float") df_panel["y_binary"] = np.random.binomial(n=1, p=0.5, size=df_panel.shape[0]) datasets["did_panel"] = DoubleMLPanelData( df_panel, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] @@ -41,10 +42,23 @@ dml_objs = [ - (DoubleMLDIDMulti(datasets["did_panel"], ml_g=Lasso(), ml_m=LogisticRegression(), **dml_args), DoubleMLDIDMulti), + ( + DoubleMLDIDMulti(datasets["did_panel"], panel=True, ml_g=Lasso(), ml_m=LogisticRegression(), **dml_args), + DoubleMLDIDMulti, + ), + ( + DoubleMLDIDMulti(datasets["did_panel"], panel=False, ml_g=Lasso(), ml_m=LogisticRegression(), **dml_args), + DoubleMLDIDMulti, + ), + ( + DoubleMLDIDMulti( + datasets["did_panel_binary_outcome"], panel=True, ml_g=LogisticRegression(), ml_m=LogisticRegression(), **dml_args + ), + DoubleMLDIDMulti, + ), ( DoubleMLDIDMulti( - datasets["did_panel_binary_outcome"], ml_g=LogisticRegression(), ml_m=LogisticRegression(), **dml_args + datasets["did_panel_binary_outcome"], panel=False, ml_g=LogisticRegression(), ml_m=LogisticRegression(), **dml_args ), 
DoubleMLDIDMulti, ), @@ -83,13 +97,20 @@ def test_panel_property_types_and_shapes(fitted_dml_obj): n_treat = len(fitted_dml_obj.gt_combinations) dml_obj = fitted_dml_obj + if dml_obj.panel: + score_dim = (N_IDS, n_treat, N_REP) + else: + score_dim = (df_panel.shape[0], n_treat, N_REP) + + assert dml_obj._score_dim == score_dim + # check_basic_property_types_and_shapes # check that the setting is still in line with the hard-coded values assert dml_obj._dml_data.n_treat == 1 assert dml_obj.n_gt_atts == n_treat assert dml_obj.n_rep == N_REP assert dml_obj.n_folds == N_FOLDS - assert dml_obj._dml_data.n_obs == N_OBS + assert dml_obj._dml_data.n_obs == df_panel.shape[0] assert dml_obj.n_rep_boot == N_REP_BOOT assert isinstance(dml_obj.all_coef, np.ndarray) @@ -111,11 +132,7 @@ def test_panel_property_types_and_shapes(fitted_dml_obj): assert dml_obj.t_stat.shape == (n_treat,) assert isinstance(dml_obj.framework.scaled_psi, np.ndarray) - assert dml_obj.framework.scaled_psi.shape == ( - N_OBS, - n_treat, - N_REP, - ) + assert dml_obj.framework.scaled_psi.shape == score_dim assert isinstance(dml_obj.framework, DoubleMLFramework) assert isinstance(dml_obj.pval, np.ndarray) @@ -125,7 +142,10 @@ def test_panel_property_types_and_shapes(fitted_dml_obj): assert len(dml_obj._dml_data.binary_treats) == 1 # check_basic_predictions_and_targets - expected_keys = ["ml_g0", "ml_g1", "ml_m"] + if dml_obj.panel: + expected_keys = ["ml_g0", "ml_g1", "ml_m"] + else: + expected_keys = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1", "ml_m"] for key in expected_keys: assert isinstance(dml_obj.nuisance_loss[key], np.ndarray) assert dml_obj.nuisance_loss[key].shape == (N_REP, n_treat) @@ -136,6 +156,10 @@ def test_panel_sensitivity_return_types(fitted_dml_obj): n_treat = len(fitted_dml_obj.gt_combinations) benchmarking_set = [fitted_dml_obj._dml_data.x_cols[0]] dml_obj = fitted_dml_obj + if dml_obj.panel: + score_dim = (N_IDS, n_treat, N_REP) + else: + score_dim = (df_panel.shape[0], n_treat, N_REP) assert isinstance(dml_obj.sensitivity_elements, dict) for key in ["sigma2", "nu2", "max_bias"]: @@ -143,7 +167,7 @@ def test_panel_sensitivity_return_types(fitted_dml_obj): assert dml_obj.sensitivity_elements[key].shape == (1, n_treat, N_REP) for key in ["psi_max_bias"]: assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray) - assert dml_obj.sensitivity_elements[key].shape == (N_OBS, n_treat, N_REP) + assert dml_obj.sensitivity_elements[key].shape == score_dim assert isinstance(dml_obj.sensitivity_summary, str) dml_obj.sensitivity_analysis() diff --git a/doubleml/did/tests/test_did_multi_vs_binary.py b/doubleml/did/tests/test_did_multi_vs_binary.py index 40b877b2..15d3fd0c 100644 --- a/doubleml/did/tests/test_did_multi_vs_binary.py +++ b/doubleml/did/tests/test_did_multi_vs_binary.py @@ -49,7 +49,7 @@ def dml_did_binary_vs_did_multi_fixture(time_type, learner, score, in_sample_nor n_obs = 500 dpg = 1 boot_methods = ["normal"] - n_rep_boot = 50000 + n_rep_boot = 500 # collect data df = make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) diff --git a/doubleml/did/tests/test_did_multi_vs_cs_binary.py b/doubleml/did/tests/test_did_multi_vs_cs_binary.py new file mode 100644 index 00000000..7af8d74d --- /dev/null +++ b/doubleml/did/tests/test_did_multi_vs_cs_binary.py @@ -0,0 +1,213 @@ +import math + +import numpy as np +import pytest +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml 
as dml +from doubleml.did.datasets import make_did_cs_CS2021 +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.5, 0.1]) +def lambda_t(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_binary_vs_did_multi_fixture(time_type, lambda_t, learner, score, in_sample_normalization, trimming_threshold): + n_obs = 500 + dpg = 1 + boot_methods = ["normal"] + n_rep_boot = 500 + + # collect data + df = make_did_cs_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type, lambda_t=lambda_t) + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "n_folds": 3, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": True, + } + gt_combination = [(dml_panel_data.g_values[0], dml_panel_data.t_values[0], dml_panel_data.t_values[1])] + dml_did_multi_obj = dml.did.DoubleMLDIDMulti( + dml_panel_data, + ml_g=learner[0], + ml_m=learner[1], + gt_combinations=gt_combination, + panel=False, + **dml_args, + ) + dml_did_multi_obj.fit() + + treatment_col = dml_panel_data.d_cols[0] + ext_pred_dict = {treatment_col: {}} + all_keys = ["ml_g_d0_t0", "ml_g_d0_t1", "ml_g_d1_t0", "ml_g_d1_t1"] + for key in all_keys: + ext_pred_dict["d"][key] = dml_did_multi_obj.modellist[0].predictions[key][:, :, 0] + if score == "observational": + ext_pred_dict[treatment_col]["ml_m"] = dml_did_multi_obj.modellist[0].predictions["ml_m"][:, :, 0] + + dml_did_binary_obj = dml.did.DoubleMLDIDCSBinary( + dml_panel_data, + g_value=gt_combination[0][0], + t_value_pre=gt_combination[0][1], + t_value_eval=gt_combination[0][2], + ml_g=DMLDummyRegressor(), + ml_m=DMLDummyClassifier(), + **dml_args, + ) + dml_did_binary_obj.fit(external_predictions=ext_pred_dict) + + res_dict = { + "coef_multi": dml_did_multi_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "se_multi": dml_did_multi_obj.se, + "se_binary": dml_did_binary_obj.se, + "boot_methods": boot_methods, + "nuisance_loss_multi": dml_did_multi_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + dml_did_multi_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + # approximately same ci (bootstrap not identical due to size of score) + res_dict["boot_ci" + bootstrap + "_multi"] = dml_did_multi_obj.confint(joint=True) + res_dict["boot_ci" + bootstrap + "_binary"] = dml_did_binary_obj.confint(joint=True) + + # sensitivity tests + 
res_dict["sensitivity_elements_multi"] = dml_did_multi_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.framework.sensitivity_elements + + dml_did_multi_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params_multi"] = dml_did_multi_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_binary_vs_did_multi_fixture): + assert math.isclose( + dml_did_binary_vs_did_multi_fixture["coef_binary"][0], + dml_did_binary_vs_did_multi_fixture["coef_multi"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_se(dml_did_binary_vs_did_multi_fixture): + assert math.isclose( + dml_did_binary_vs_did_multi_fixture["se_binary"][0], + dml_did_binary_vs_did_multi_fixture["se_multi"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_boot(dml_did_binary_vs_did_multi_fixture): + for bootstrap in dml_did_binary_vs_did_multi_fixture["boot_methods"]: + assert np.allclose( + dml_did_binary_vs_did_multi_fixture["boot_ci" + bootstrap + "_multi"].values, + dml_did_binary_vs_did_multi_fixture["boot_ci" + bootstrap + "_binary"].values, + atol=1e-2, + ) + + +@pytest.mark.ci +def test_nuisance_loss(dml_did_binary_vs_did_multi_fixture): + assert ( + dml_did_binary_vs_did_multi_fixture["nuisance_loss_multi"].keys() + == dml_did_binary_vs_did_multi_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_binary_vs_did_multi_fixture["nuisance_loss_multi"].items(): + assert np.allclose(value, dml_did_binary_vs_did_multi_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_binary_vs_did_multi_fixture): + elements_multi = dml_did_binary_vs_did_multi_fixture["sensitivity_elements_multi"] + elements_binary = dml_did_binary_vs_did_multi_fixture["sensitivity_elements_binary"] + sensitivity_element_names = ["max_bias", "psi_max_bias", "sigma2", "nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + elements_multi[sensitivity_element], + elements_binary[sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_binary_vs_did_multi_fixture): + multi_params = dml_did_binary_vs_did_multi_fixture["sensitivity_params_multi"] + binary_params = dml_did_binary_vs_did_multi_fixture["sensitivity_params_binary"] + for key in ["theta", "se", "ci"]: + assert np.allclose( + multi_params[key]["lower"], + binary_params[key]["lower"], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + multi_params[key]["upper"], + binary_params[key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + multi_params[key], + binary_params[key], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_return_types.py b/doubleml/did/tests/test_return_types.py index a59cec6c..37105c3e 100644 --- a/doubleml/did/tests/test_return_types.py +++ b/doubleml/did/tests/test_return_types.py @@ -4,8 +4,8 @@ from sklearn.linear_model import Lasso, LogisticRegression from doubleml.data import DoubleMLData, DoubleMLPanelData -from doubleml.did import DoubleMLDID, DoubleMLDIDBinary, DoubleMLDIDCS -from doubleml.did.datasets import make_did_CS2021, make_did_SZ2020 +from doubleml.did import DoubleMLDID, DoubleMLDIDBinary, DoubleMLDIDCS, DoubleMLDIDCSBinary +from doubleml.did.datasets import make_did_CS2021, make_did_cs_CS2021, make_did_SZ2020 
from doubleml.utils._check_return_types import ( check_basic_predictions_and_targets, check_basic_property_types_and_shapes, @@ -79,7 +79,8 @@ def test_sensitivity_return_types(fitted_dml_obj): # panel data -df_panel = make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=5, time_type="float") +N_PERIODS = 5 +df_panel = make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=N_PERIODS, time_type="float") df_panel["y_binary"] = np.random.binomial(n=1, p=0.5, size=df_panel.shape[0]) datasets["did_panel"] = DoubleMLPanelData( df_panel, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] @@ -88,6 +89,17 @@ def test_sensitivity_return_types(fitted_dml_obj): df_panel, y_col="y_binary", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] ) +# Create a dataset for DoubleMLDIDCSBinary +df_panel_cs = make_did_cs_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=N_PERIODS, time_type="float") +df_panel_cs["y_binary"] = np.random.binomial(n=1, p=0.5, size=df_panel_cs.shape[0]) +datasets["did_panel_cs"] = DoubleMLPanelData( + df_panel_cs, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) +datasets["did_panel_cs_binary_outcome"] = DoubleMLPanelData( + df_panel_cs, y_col="y_binary", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) + + dml_panel_binary_args = dml_args | { "g_value": 2, "t_value_pre": 0, @@ -105,6 +117,19 @@ def test_sensitivity_return_types(fitted_dml_obj): ), DoubleMLDIDBinary, ), + ( + DoubleMLDIDCSBinary(datasets["did_panel_cs"], ml_g=Lasso(), ml_m=LogisticRegression(), **dml_panel_binary_args), + DoubleMLDIDCSBinary, + ), + ( + DoubleMLDIDCSBinary( + datasets["did_panel_cs_binary_outcome"], + ml_g=LogisticRegression(), + ml_m=LogisticRegression(), + **dml_panel_binary_args, + ), + DoubleMLDIDCSBinary, + ), ] @@ -121,12 +146,16 @@ def test_panel_return_types(dml_obj, cls): assert isinstance(dml_obj.t_value_pre, (int, np.integer, float, np.floating)) assert isinstance(dml_obj.post_treatment, bool) - # Test panel_data_wide property - assert isinstance(dml_obj.panel_data_wide, pd.DataFrame) - assert dml_obj.panel_data_wide.shape[0] <= N_OBS - assert "G_indicator" in dml_obj.panel_data_wide.columns - assert "C_indicator" in dml_obj.panel_data_wide.columns - assert "y_diff" in dml_obj.panel_data_wide.columns + # Test data_subset property + assert isinstance(dml_obj.data_subset, pd.DataFrame) + if isinstance(dml_obj, DoubleMLDIDBinary): + assert dml_obj.data_subset.shape[0] <= N_OBS + assert "y_diff" in dml_obj.data_subset.columns + elif isinstance(dml_obj, DoubleMLDIDCSBinary): + assert dml_obj.data_subset.shape[0] <= N_OBS * 2 + assert "t_indicator" in dml_obj.data_subset.columns + assert "G_indicator" in dml_obj.data_subset.columns + assert "C_indicator" in dml_obj.data_subset.columns # Test id_positions property assert isinstance(dml_obj.id_positions, np.ndarray) @@ -141,7 +170,10 @@ def test_panel_return_types(dml_obj, cls): # Test n_obs property assert isinstance(dml_obj.n_obs, (int, np.integer)) - assert dml_obj.n_obs <= N_OBS + if isinstance(dml_obj, DoubleMLDIDBinary): + assert dml_obj.n_obs <= N_OBS + elif isinstance(dml_obj, DoubleMLDIDCSBinary): + assert dml_obj.n_obs <= N_OBS * N_PERIODS # Test consistency between properties if dml_obj.post_treatment: @@ -160,12 +192,29 @@ def fitted_panel_dml_obj(request): @pytest.mark.ci def test_panel_property_types_and_shapes(fitted_panel_dml_obj): - 
check_basic_property_types_and_shapes(fitted_panel_dml_obj, N_OBS, N_TREAT, N_REP, N_FOLDS, N_REP_BOOT) - check_basic_predictions_and_targets(fitted_panel_dml_obj, N_OBS, N_TREAT, N_REP) + # n_obs for psi, psi_a, psi_b checks within check_basic_property_types_and_shapes + # This should be the number of observations used for the score calculation. + # For DIDBinary, it's n_ids. For DIDCSBinary, it's _n_obs_subset. + # Both are consistently available as fitted_panel_dml_obj.n_obs. + actual_score_dim = (fitted_panel_dml_obj.n_obs, N_REP, N_TREAT) + + check_basic_property_types_and_shapes( + fitted_panel_dml_obj, + n_obs=fitted_panel_dml_obj._dml_data.n_obs, + n_treat=N_TREAT, + n_rep=N_REP, + n_folds=N_FOLDS, + n_rep_boot=N_REP_BOOT, + score_dim=actual_score_dim, # Used for psi shape + ) + + check_basic_predictions_and_targets(fitted_panel_dml_obj, fitted_panel_dml_obj.n_obs, N_TREAT, N_REP) @pytest.mark.ci def test_panel_sensitivity_return_types(fitted_panel_dml_obj): if fitted_panel_dml_obj._sensitivity_implemented: benchmarking_set = [fitted_panel_dml_obj._dml_data.x_cols[0]] - check_sensitivity_return_types(fitted_panel_dml_obj, N_OBS, N_REP, N_TREAT, benchmarking_set=benchmarking_set) + check_sensitivity_return_types( + fitted_panel_dml_obj, fitted_panel_dml_obj.n_obs, N_REP, N_TREAT, benchmarking_set=benchmarking_set + ) diff --git a/doubleml/did/utils/_did_utils.py b/doubleml/did/utils/_did_utils.py index bb69a1ef..2e7c011d 100644 --- a/doubleml/did/utils/_did_utils.py +++ b/doubleml/did/utils/_did_utils.py @@ -84,12 +84,6 @@ def _check_gt_combination(gt_combination, g_values, t_values, never_treated_valu if t_value_pre == t_value_eval: raise ValueError(f"The pre-treatment and evaluation period must be different. Got {t_value_pre} for both.") - if t_value_pre > t_value_eval: - raise ValueError( - "The pre-treatment period must be before the evaluation period. " - f"Got t_value_pre {t_value_pre} and t_value_eval {t_value_eval}." - ) - # get t_value equal to g_value and adjust for anticipation periods maximal_t_pre = t_values[max(np.where(t_values == g_value)[0] - anticipation_periods, 0)] if t_value_pre >= maximal_t_pre: @@ -121,14 +115,16 @@ def _construct_gt_combinations(setting, g_values, t_values, never_treated_value, """Construct treatment-time combinations for difference-in-differences analysis. Parameters: - setting (str): Strategy for constructing combinations ('standard' only) + setting (str): Strategy for constructing combinations. One of 'standard', 'all', 'universal'. g_values (array): Treatment group values, must be sorted t_values (array): Time period values, must be sorted + never_treated_value (int, float or pd.NaT): Value indicating never-treated units. + anticipation_periods (int): Number of anticipation periods. Returns: list: List of (g_val, t_pre, t_eval) tuples """ - valid_settings = ["standard", "all"] + valid_settings = ["standard", "all", "universal"] if setting not in valid_settings: raise ValueError(f"gt_combinations must be one of {valid_settings}. 
{setting} was passed.") @@ -139,29 +135,34 @@ def _construct_gt_combinations(setting, g_values, t_values, never_treated_value, raise ValueError("t_values must be sorted in ascending order.") gt_combinations = [] - if setting == "standard": - for g_val in treatment_groups: - t_values_before_g = t_values[t_values < g_val] - if len(t_values_before_g) > anticipation_periods: - first_eval_index = anticipation_periods + 1 # first relevant evaluation period index - t_before_g = t_values_before_g[-first_eval_index] - - # collect all evaluation periods - for i_t_eval, t_eval in enumerate(t_values[first_eval_index:]): - t_previous = t_values[i_t_eval] # refers to t-anticipation_periods-1 - t_pre = min(t_previous, t_before_g) # if t_previous larger than g_val, use t_before_g - gt_combinations.append((g_val, t_pre, t_eval)) - - if setting == "all": - for g_val in treatment_groups: - t_values_before_g = t_values[t_values < g_val] - if len(t_values_before_g) > anticipation_periods: - first_eval_index = anticipation_periods + 1 # first relevant evaluation period index - for t_eval in t_values[first_eval_index:]: - # all t-values before g_val - anticipation_periods - valid_t_pre_values = t_values[t_values <= min(g_val, t_eval)][:-first_eval_index] - for t_pre in valid_t_pre_values: - gt_combinations.append((g_val, t_pre, t_eval)) + for g_val in treatment_groups: + t_values_before_g = t_values[t_values < g_val] + if len(t_values_before_g) <= anticipation_periods: + continue + first_eval_index = anticipation_periods + 1 # first relevant evaluation period index + + if setting == "standard": + t_before_g = t_values_before_g[-first_eval_index] + combinations = [ + (g_val, min(t_values[i_t_eval], t_before_g), t_eval) + for i_t_eval, t_eval in enumerate(t_values[first_eval_index:]) + ] + gt_combinations.extend(combinations) + + elif setting == "all": + combinations = [ + (g_val, t_pre, t_eval) + for t_eval in t_values[first_eval_index:] + for t_pre in t_values[t_values <= min(g_val, t_eval)][:-first_eval_index] + ] + gt_combinations.extend(combinations) + + elif setting == "universal": + # The base period is the last period before treatment, accounting for anticipation. + # `g_val - anticipation_periods - 1` is not robust for non-integer or non-consecutive periods. + base_period = t_values_before_g[-first_eval_index] + combinations = [(g_val, base_period, t_eval) for t_eval in t_values if t_eval != base_period] + gt_combinations.extend(combinations) if len(gt_combinations) == 0: raise ValueError( diff --git a/doubleml/did/utils/tests/test_did_utils.py b/doubleml/did/utils/tests/test_did_utils.py index df9da7f2..f0ffaf7c 100644 --- a/doubleml/did/utils/tests/test_did_utils.py +++ b/doubleml/did/utils/tests/test_did_utils.py @@ -99,11 +99,6 @@ def test_check_gt_combination(): ValueError, "The pre-treatment and evaluation period must be different. Got 1 for both.", ), - ( - {"gt_combination": (1, 1, 0)}, - ValueError, - "The pre-treatment period must be before the evaluation period. Got t_value_pre 1 and t_value_eval 0.", - ), ( {"gt_combination": (-1, 0, 1)}, ValueError, @@ -171,7 +166,7 @@ def test_input_check_gt_values(): @pytest.mark.ci def test_construct_gt_combinations(): - msg = r"gt_combinations must be one of \['standard', 'all'\]. test was passed." + msg = r"gt_combinations must be one of \['standard', 'all', 'universal'\]. test was passed." 
with pytest.raises(ValueError, match=msg): _construct_gt_combinations( setting="test", @@ -256,6 +251,24 @@ def test_construct_gt_combinations(): ] assert all_combinations == expected_all + # Test universal setting + universal_combinations = _construct_gt_combinations( + setting="universal", + g_values=np.array([2, 3]), + t_values=np.array([0, 1, 2, 3]), + never_treated_value=np.inf, + anticipation_periods=0, + ) + expected_universal = [ + (2, 1, 0), # g=2, pre=1, eval=0 + (2, 1, 2), # g=2, pre=1, eval=2 + (2, 1, 3), # g=2, pre=1, eval=3 + (3, 2, 0), # g=3, pre=2, eval=0 + (3, 2, 1), # g=3, pre=2, eval=1 + (3, 2, 3), # g=3, pre=2, eval=3 + ] + assert universal_combinations == expected_universal + # Test standard setting with anticipation periods standard_combinations_anticipation = _construct_gt_combinations( setting="standard", @@ -282,6 +295,21 @@ def test_construct_gt_combinations(): ] assert all_combinations_anticipation == expected_all_anticipation + # Test universal setting with anticipation periods + universal_combinations_anticipation = _construct_gt_combinations( + setting="universal", + g_values=np.array([2, 3]), + t_values=np.array([0, 1, 2, 3]), + never_treated_value=np.inf, + anticipation_periods=2, + ) + expected_universal_anticipation = [ + (3, 0, 1), # g=3, pre=0, eval=1 with anticipation 2 + (3, 0, 2), + (3, 0, 3), + ] + assert universal_combinations_anticipation == expected_universal_anticipation + @pytest.mark.ci def test_construct_gt_index(): diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 764865a4..694968bc 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -98,69 +98,99 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting): # perform sample splitting self._smpls = None self._smpls_cluster = None + self._n_obs_sample_splitting = self.n_obs if draw_sample_splitting: self.draw_sample_splitting() + self._score_dim = (self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs) # initialize arrays according to obj_dml_data and the resampling settings - ( - self._psi, - self._psi_deriv, - self._psi_elements, - self._var_scaling_factors, - self._coef, - self._se, - self._all_coef, - self._all_se, - ) = self._initialize_arrays() + self._initialize_arrays() # initialize instance attributes which are later used for iterating self._i_rep = None self._i_treat = None - def __str__(self): + def _format_header_str(self): class_name = self.__class__.__name__ - header = f"================== {class_name} Object ==================\n" - data_summary = self._dml_data._data_summary_str() - score_info = f"Score function: {str(self.score)}\n" + return f"================== {class_name} Object ==================" + + def _format_score_info_str(self): + return f"Score function: {str(self.score)}" + + def _format_learner_info_str(self): learner_info = "" - for key, value in self.learner.items(): - learner_info += f"Learner {key}: {str(value)}\n" + if self.learner is not None: + for key, value in self.learner.items(): + learner_info += f"Learner {key}: {str(value)}\n" if self.nuisance_loss is not None: learner_info += "Out-of-sample Performance:\n" - is_classifier = [value for value in self._is_classifier.values()] - is_regressor = [not value for value in is_classifier] - if any(is_regressor): - learner_info += "Regression:\n" - for learner in [key for key, value in self._is_classifier.items() if value is False]: - learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n" - if any(is_classifier): - learner_info += 
"Classification:\n" - for learner in [key for key, value in self._is_classifier.items() if value is True]: - learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n" + # Check if _is_classifier is populated, otherwise, it might be called before fit + if self._is_classifier: + is_classifier_any = any(self._is_classifier.values()) + is_regressor_any = any(not v for v in self._is_classifier.values()) + + if is_regressor_any: + learner_info += "Regression:\n" + for learner_name in self.params_names: # Iterate through known learners + if not self._is_classifier.get(learner_name, True): # Default to not regressor if not found + loss_val = self.nuisance_loss.get(learner_name, "N/A") + learner_info += f"Learner {learner_name} RMSE: {loss_val}\n" + if is_classifier_any: + learner_info += "Classification:\n" + for learner_name in self.params_names: # Iterate through known learners + if self._is_classifier.get(learner_name, False): # Default to not classifier if not found + loss_val = self.nuisance_loss.get(learner_name, "N/A") + learner_info += f"Learner {learner_name} Log Loss: {loss_val}\n" + else: + learner_info += " (Run .fit() to see out-of-sample performance)\n" + return learner_info.strip() + def _format_resampling_info_str(self): if self._is_cluster_data: - resampling_info = ( + return ( f"No. folds per cluster: {self._n_folds_per_cluster}\n" f"No. folds: {self.n_folds}\n" - f"No. repeated sample splits: {self.n_rep}\n" + f"No. repeated sample splits: {self.n_rep}" ) else: - resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n" - fit_summary = str(self.summary) - res = ( - header - + "\n------------------ Data summary ------------------\n" - + data_summary - + "\n------------------ Score & algorithm ------------------\n" - + score_info - + "\n------------------ Machine learner ------------------\n" - + learner_info - + "\n------------------ Resampling ------------------\n" - + resampling_info - + "\n------------------ Fit summary ------------------\n" - + fit_summary + return f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}" + + def _format_additional_info_str(self): + """ + Hook for subclasses to add additional information to the string representation. + Returns an empty string by default. + Subclasses should override this method to provide content. + The content should not include the 'Additional Information' header itself. 
+ """ + return "" + + def __str__(self): + header = self._format_header_str() + # Assumes self._dml_data._data_summary_str() exists and is well-formed + data_summary = self._dml_data._data_summary_str() + score_info = self._format_score_info_str() + learner_info = self._format_learner_info_str() + resampling_info = self._format_resampling_info_str() + fit_summary = str(self.summary) # Assumes self.summary is well-formed + + representation = ( + f"{header}\n" + f"\n------------------ Data Summary ------------------\n" + f"{data_summary}\n" + f"\n------------------ Score & Algorithm ------------------\n" + f"{score_info}\n" + f"\n------------------ Machine Learner ------------------\n" + f"{learner_info}\n" + f"\n------------------ Resampling ------------------\n" + f"{resampling_info}\n" + f"\n------------------ Fit Summary ------------------\n" + f"{fit_summary}" ) - return res + + additional_info = self._format_additional_info_str() + if additional_info: + representation += f"\n\n------------------ Additional Information ------------------\n" f"{additional_info}" + return representation @property def n_folds(self): @@ -855,7 +885,7 @@ def tune( self.set_ml_nuisance_params(nuisance_model, self._dml_data.d_cols[i_d], params) else: - smpls = [(np.arange(self._dml_data.n_obs), np.arange(self._dml_data.n_obs))] + smpls = [(np.arange(self.n_obs), np.arange(self.n_obs))] # tune hyperparameters res = self._nuisance_tuning( smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search @@ -1004,7 +1034,7 @@ def _check_fit(self, n_jobs_cv, store_predictions, external_predictions, store_m external_predictions=external_predictions, valid_treatments=self._dml_data.d_cols, valid_learners=self.params_names, - n_obs=self._dml_data.n_obs, + n_obs=self.n_obs, n_rep=self.n_rep, ) elif not self._external_predictions_implemented and external_predictions is not None: @@ -1021,9 +1051,7 @@ def _initalize_fit(self, store_predictions, store_models): self._initialize_models() if self._sensitivity_implemented: - self._sensitivity_elements = self._initialize_sensitivity_elements( - (self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs) - ) + self._sensitivity_elements = self._initialize_sensitivity_elements(self._score_dim) def _fit_nuisance_and_score_elements(self, n_jobs_cv, store_predictions, external_predictions, store_models): ext_prediction_dict = _set_external_predictions( @@ -1076,30 +1104,24 @@ def _fit_sensitivity_elements(self, nuisance_predictions): def _initialize_arrays(self): # scores - psi = np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan) - psi_deriv = np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan) - psi_elements = self._initialize_score_elements((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs)) + self._psi = np.full(self._score_dim, np.nan) + self._psi_deriv = np.full(self._score_dim, np.nan) + self._psi_elements = self._initialize_score_elements(self._score_dim) - var_scaling_factors = np.full(self._dml_data.n_treat, np.nan) + n_rep = self._score_dim[1] + n_thetas = self._score_dim[2] + self._var_scaling_factors = np.full(n_thetas, np.nan) # coefficients and ses - coef = np.full(self._dml_data.n_coefs, np.nan) - se = np.full(self._dml_data.n_coefs, np.nan) + self._coef = np.full(n_thetas, np.nan) + self._se = np.full(n_thetas, np.nan) - all_coef = np.full((self._dml_data.n_coefs, self.n_rep), np.nan) - all_se = np.full((self._dml_data.n_coefs, self.n_rep), np.nan) - - return psi, psi_deriv, 
psi_elements, var_scaling_factors, coef, se, all_coef, all_se + self._all_coef = np.full((n_thetas, n_rep), np.nan) + self._all_se = np.full((n_thetas, n_rep), np.nan) def _initialize_predictions_and_targets(self): - self._predictions = { - learner: np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan) - for learner in self.params_names - } - self._nuisance_targets = { - learner: np.full((self._dml_data.n_obs, self.n_rep, self._dml_data.n_coefs), np.nan) - for learner in self.params_names - } + self._predictions = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names} + self._nuisance_targets = {learner: np.full(self._score_dim, np.nan) for learner in self.params_names} def _initialize_nuisance_loss(self): self._nuisance_loss = {learner: np.full((self.n_rep, self._dml_data.n_coefs), np.nan) for learner in self.params_names} @@ -1223,12 +1245,6 @@ def draw_sample_splitting(self): The samples are drawn according to the attributes ``n_folds`` and ``n_rep``. - Parameters - ---------- - n_obs : int or None - The number of observations. If ``None``, the number of observations is set to the number of observations in - the data set. - Returns ------- self : object @@ -1237,14 +1253,14 @@ def draw_sample_splitting(self): obj_dml_resampling = DoubleMLClusterResampling( n_folds=self._n_folds_per_cluster, n_rep=self.n_rep, - n_obs=self.n_obs, + n_obs=self._n_obs_sample_splitting, n_cluster_vars=self._dml_data.n_cluster_vars, cluster_vars=self._dml_data.cluster_vars, ) self._smpls, self._smpls_cluster = obj_dml_resampling.split_samples() else: obj_dml_resampling = DoubleMLResampling( - n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self.n_obs, stratify=self._strata + n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._n_obs_sample_splitting, stratify=self._strata ) self._smpls = obj_dml_resampling.split_samples() @@ -1311,19 +1327,12 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): >>> dml_plr_obj.set_sample_splitting(smpls) """ self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting( - all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self.n_obs + all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self._n_obs_sample_splitting ) - ( - self._psi, - self._psi_deriv, - self._psi_elements, - self._var_scaling_factors, - self._coef, - self._se, - self._all_coef, - self._all_se, - ) = self._initialize_arrays() + # set sample splitting can update the number of repetitions + self._score_dim = (self._score_dim[0], self._n_rep, self._score_dim[2]) + self._initialize_arrays() self._initialize_ml_nuisance_params() return self diff --git a/doubleml/irm/apo.py b/doubleml/irm/apo.py index e8c75172..9fcad876 100644 --- a/doubleml/irm/apo.py +++ b/doubleml/irm/apo.py @@ -46,7 +46,7 @@ class DoubleMLAPO(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable diff --git a/doubleml/irm/cvar.py b/doubleml/irm/cvar.py index d2aeaced..29d78f15 100644 --- a/doubleml/irm/cvar.py +++ b/doubleml/irm/cvar.py @@ -54,7 +54,7 @@ class DoubleMLCVAR(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. 
score : str diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py index a43c0a03..73495fd7 100644 --- a/doubleml/irm/iivm.py +++ b/doubleml/irm/iivm.py @@ -45,7 +45,7 @@ class DoubleMLIIVM(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable @@ -197,22 +197,13 @@ def __init__( self.subgroups = subgroups self._external_predictions_implemented = True - def __str__(self): - parent_str = super().__str__() - - # add robust confset + def _format_additional_info_str(self): if self.framework is None: - confset_str = "" + return "" else: confset = self.robust_confset() formatted_confset = ", ".join([f"[{lower:.4f}, {upper:.4f}]" for lower, upper in confset]) - confset_str = ( - "\n\n--------------- Additional Information ----------------\n" - + f"Robust Confidence Set: {formatted_confset}\n" - ) - - res = parent_str + confset_str - return res + return f"Robust Confidence Set: {formatted_confset}" @property def normalize_ipw(self): diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py index 9bf5ed35..76e955f9 100644 --- a/doubleml/irm/irm.py +++ b/doubleml/irm/irm.py @@ -47,7 +47,7 @@ class DoubleMLIRM(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable diff --git a/doubleml/irm/lpq.py b/doubleml/irm/lpq.py index c98e8fa2..4b2377ee 100644 --- a/doubleml/irm/lpq.py +++ b/doubleml/irm/lpq.py @@ -49,7 +49,7 @@ class DoubleMLLPQ(NonLinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str diff --git a/doubleml/irm/pq.py b/doubleml/irm/pq.py index f64dc471..7f40d27d 100644 --- a/doubleml/irm/pq.py +++ b/doubleml/irm/pq.py @@ -56,7 +56,7 @@ class DoubleMLPQ(NonLinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py index 68b91a9a..9f617e3e 100644 --- a/doubleml/irm/qte.py +++ b/doubleml/irm/qte.py @@ -39,7 +39,7 @@ class DoubleMLQTE: Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str diff --git a/doubleml/irm/ssm.py b/doubleml/irm/ssm.py index c84b326d..e7e5d83c 100644 --- a/doubleml/irm/ssm.py +++ b/doubleml/irm/ssm.py @@ -39,7 +39,7 @@ class DoubleMLSSM(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable diff --git a/doubleml/plm/pliv.py b/doubleml/plm/pliv.py index ba022688..fdf4e28d 100644 --- a/doubleml/plm/pliv.py +++ b/doubleml/plm/pliv.py @@ -45,7 +45,7 @@ class DoubleMLPLIV(LinearScoreMixin, DoubleML): Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py index a81bac48..30ad763e 100644 --- a/doubleml/plm/plr.py +++ b/doubleml/plm/plr.py @@ -44,7 +44,7 @@ class DoubleMLPLR(LinearScoreMixin, DoubleML): Default is ``5``. 
n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. score : str or callable diff --git a/doubleml/rdd/rdd.py b/doubleml/rdd/rdd.py index 858ae5ed..be1cd797 100644 --- a/doubleml/rdd/rdd.py +++ b/doubleml/rdd/rdd.py @@ -50,7 +50,7 @@ class RDFlex: Default is ``5``. n_rep : int - Number of repetitons for the sample splitting. + Number of repetitions for the sample splitting. Default is ``1``. cutoff : float or int diff --git a/doubleml/utils/_check_return_types.py b/doubleml/utils/_check_return_types.py index 54462059..54e72833 100644 --- a/doubleml/utils/_check_return_types.py +++ b/doubleml/utils/_check_return_types.py @@ -31,10 +31,14 @@ def check_basic_return_types(dml_obj, cls): assert isinstance(dml_obj._dml_data.__str__(), str) -def check_basic_property_types_and_shapes(dml_obj, n_obs, n_treat, n_rep, n_folds, n_rep_boot): +def check_basic_property_types_and_shapes(dml_obj, n_obs, n_treat, n_rep, n_folds, n_rep_boot, score_dim=None): # not checked: learner, learner_names, params, params_names, score # already checked: summary + # use default combination + if score_dim is None: + score_dim = (n_obs, n_rep, n_treat) + # check that the setting is still in line with the hard-coded values assert dml_obj._dml_data.n_treat == n_treat assert dml_obj.n_rep == n_rep @@ -55,35 +59,19 @@ def check_basic_property_types_and_shapes(dml_obj, n_obs, n_treat, n_rep, n_fold assert dml_obj.coef.shape == (n_treat,) assert isinstance(dml_obj.psi, np.ndarray) - assert dml_obj.psi.shape == ( - n_obs, - n_rep, - n_treat, - ) + assert dml_obj.psi.shape == score_dim is_nonlinear = isinstance(dml_obj, NonLinearScoreMixin) if is_nonlinear: for score_element in dml_obj._score_element_names: assert isinstance(dml_obj.psi_elements[score_element], np.ndarray) - assert dml_obj.psi_elements[score_element].shape == ( - n_obs, - n_rep, - n_treat, - ) + assert dml_obj.psi_elements[score_element].shape == score_dim else: assert isinstance(dml_obj.psi_elements["psi_a"], np.ndarray) - assert dml_obj.psi_elements["psi_a"].shape == ( - n_obs, - n_rep, - n_treat, - ) + assert dml_obj.psi_elements["psi_a"].shape == score_dim assert isinstance(dml_obj.psi_elements["psi_b"], np.ndarray) - assert dml_obj.psi_elements["psi_b"].shape == ( - n_obs, - n_rep, - n_treat, - ) + assert dml_obj.psi_elements["psi_b"].shape == score_dim assert isinstance(dml_obj.framework, DoubleMLFramework) assert isinstance(dml_obj.pval, np.ndarray)
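For orientation, the workflow exercised by the new tests in this patch can be sketched as follows. This is a minimal, illustrative usage sketch mirroring the test fixtures above (the learners, sample size, and the "universal" base-period choice are taken from the fixtures and are not prescriptive defaults):

import doubleml as dml
from sklearn.linear_model import LinearRegression, LogisticRegression
from doubleml.did.datasets import make_did_cs_CS2021

# repeated cross-sectional data from the new DGP helper
df = make_did_cs_CS2021(n_obs=500, dgp_type=1, time_type="float")
dml_data = dml.data.DoubleMLPanelData(
    df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]
)

# panel=False selects the repeated cross-section (DID-CS) score;
# gt_combinations="universal" uses a common pre-treatment base period per group
dml_obj = dml.did.DoubleMLDIDMulti(
    obj_dml_data=dml_data,
    ml_g=LinearRegression(),
    ml_m=LogisticRegression(solver="lbfgs", max_iter=250),
    gt_combinations="universal",
    panel=False,
    n_folds=3,
)
dml_obj.fit()
aggregation = dml_obj.aggregate(aggregation="eventstudy")

A single (g, t) cell can equivalently be estimated with the new DoubleMLDIDCSBinary class, which takes g_value, t_value_pre and t_value_eval directly, as in the comparison tests above.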