diff --git a/pyfixest/did/did2s.py b/pyfixest/did/did2s.py
index 72c4b6af3..bcb65aa44 100644
--- a/pyfixest/did/did2s.py
+++ b/pyfixest/did/did2s.py
@@ -10,6 +10,7 @@
 from pyfixest.estimation.feols_ import Feols
 from pyfixest.estimation.FormulaParser import FixestFormulaParser
 from pyfixest.estimation.model_matrix_fixest_ import model_matrix_fixest
+from pyfixest.utils.utils import rename_did_coefficients
 
 
 class DID2S(DID):
@@ -146,10 +147,32 @@ def iplot(
         )
 
     def tidy(self):  # noqa: D102
-        return self.tidy()
+        """
+        Return a tidy DataFrame with the estimation results.
+
+        Returns
+        -------
+        pd.DataFrame
+            A DataFrame with the estimation results and renamed coefficients.
+        """
+        # Copy so that the stored coefficient table keeps its original names.
+        result = self._coeftable.copy()
+        # Assigning a list to ``.index`` drops the index name, so restore it.
+        index_name = result.index.name
+        result.index = rename_did_coefficients(result.index)
+        result.index.name = index_name
+        return result
 
     def summary(self):  # noqa: D102
-        return self.summary()
+        """
+        Return a summary of the estimation results.
+
+        Returns
+        -------
+        pd.DataFrame
+            The renamed coefficient table, identical to the result of ``tidy()``.
+        """
+        return self.tidy()
 
 
 def _did2s_estimate(
diff --git a/pyfixest/did/lpdid.py b/pyfixest/did/lpdid.py
index fbeddf3ee..ab9c88199 100644
--- a/pyfixest/did/lpdid.py
+++ b/pyfixest/did/lpdid.py
@@ -8,6 +8,7 @@
 from pyfixest.estimation.feols_ import Feols
 from pyfixest.estimation.literals import VcovTypeOptions
 from pyfixest.report.visualize import _HAS_LETS_PLOT, _coefplot
+from pyfixest.utils.utils import rename_did_coefficients
 
 
 class LPDID(DID):
@@ -188,10 +189,32 @@ def iplot(
         )
 
     def tidy(self):  # noqa: D102
-        return self._coeftable
+        """
+        Return a tidy DataFrame with the estimation results.
+
+        Returns
+        -------
+        pd.DataFrame
+            A DataFrame with the estimation results and renamed coefficients.
+        """
+        # Copy so that the stored coefficient table keeps its original names.
+        result = self._coeftable.copy()
+        # Assigning a list to ``.index`` drops the index name, so restore it.
+        index_name = result.index.name
+        result.index = rename_did_coefficients(result.index)
+        result.index.name = index_name
+        return result
 
     def summary(self):  # noqa: D102
-        return self._coeftable
+        """
+        Return a summary of the estimation results.
+
+        Returns
+        -------
+        pd.DataFrame
+            The renamed coefficient table, identical to the result of ``tidy()``.
+        """
+        return self.tidy()
 
 
 def _lpdid_estimate(
diff --git a/pyfixest/did/twfe.py b/pyfixest/did/twfe.py
index eb4aa4c32..f21a62bef 100644
--- a/pyfixest/did/twfe.py
+++ b/pyfixest/did/twfe.py
@@ -5,6 +5,7 @@
 from pyfixest.did.did import DID
 from pyfixest.estimation.estimation import feols
 from pyfixest.estimation.feols_ import Feols
+from pyfixest.utils.utils import rename_did_coefficients
 
 
 class TWFE(DID):
@@ -113,7 +114,29 @@ def iplot(
         )
 
     def tidy(self):  # noqa: D102
-        return self.tidy()
+        """
+        Return a tidy DataFrame with the estimation results.
+
+        Returns
+        -------
+        pd.DataFrame
+            A DataFrame with the estimation results and renamed coefficients.
+        """
+        # Copy so that the stored coefficient table keeps its original names.
+        result = self._coeftable.copy()
+        # Assigning a list to ``.index`` drops the index name, so restore it.
+        index_name = result.index.name
+        result.index = rename_did_coefficients(result.index)
+        result.index.name = index_name
+        return result
 
     def summary(self):  # noqa: D102
-        return self.summary()
+        """
+        Return a summary of the estimation results.
+
+        Returns
+        -------
+        pd.DataFrame
+            The renamed coefficient table, identical to the result of ``tidy()``.
+        """
+        return self.tidy()
diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py
index 2471c4c3e..0bbbaafb1 100644
--- a/pyfixest/estimation/feols_.py
+++ b/pyfixest/estimation/feols_.py
@@ -52,7 +52,12 @@
     _narwhals_to_pandas,
     _select_order_coefs,
 )
-from pyfixest.utils.utils import capture_context, get_ssc, simultaneous_crit_val
+from pyfixest.utils.utils import (
+    capture_context,
+    get_ssc,
+    rename_did_coefficients,
+    simultaneous_crit_val,
+)
 
 decomposition_type = Literal["gelbach"]
 prediction_type = Literal["response", "link"]
@@ -1961,6 +1966,14 @@ def tidy(
                     UserWarning,
                 )
 
+        # DiD models report the concise ``variable::level::interaction`` names;
+        # every other model keeps its coefficient names untouched.
+        coef_names = (
+            rename_did_coefficients(self._coefnames)
+            if getattr(self, "_is_did_model", False)
+            else self._coefnames
+        )
+
         tidy_df = pd.DataFrame(
             {
-                "Coefficient": self._coefnames,
+                "Coefficient": coef_names,
diff --git a/pyfixest/utils/utils.py b/pyfixest/utils/utils.py
index 00ae5f646..9adc295cc 100644
--- a/pyfixest/utils/utils.py
+++ b/pyfixest/utils/utils.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 import pandas as pd
+import re
+import warnings
 from formulaic import Formula
 from formulaic.utils.context import capture_context as _capture_context
 
@@ -355,3 +357,81 @@ def capture_context(context: Union[int, Mapping[str, Any]]) -> Mapping[str, Any]
         procedure like: `.get_model_matrix(..., context=)`.
     """
     return _capture_context(context + 2) if isinstance(context, int) else context
+
+
+# Precompiled patterns used by ``rename_did_coefficients``.
+# Variable name of a categorical term: "f1" in "C(f1, contr...)" or "C(f1)".
+CAT_VAR_PATTERN = re.compile(r"\bC\(\s*([^,)]+?)\s*[,)]")
+
+# Level of a treatment-coded term: "0.0" in "[T.0.0]", "-5" in "[T.-5]".
+LEVEL_PATTERN = re.compile(r"\[T\.([^\]]+)\]")
+
+# Trailing interaction variable: "X1" in "...:X1".
+INTERACTION_PATTERN = re.compile(r":([^:]+)$")
+
+
+def rename_did_coefficients(coef_names):
+    """
+    Rename DiD model coefficients to a more concise format.
+
+    Verbose names of categorical terms, as produced by the formula system,
+    are rewritten to ``variable::level::interaction``; a component that is
+    not part of the name is left out:
+
+    - "C(f1, contr.treatment(base=1))[T.0.0]:X1" -> "f1::0.0::X1"
+    - "C(rel_year, contr.treatment(base=-1))[T.-5]" -> "rel_year::-5"
+    - "C(f1, contr.treatment(base=1)):X1" -> "f1::X1"
+    - names without a ``C(...)`` term are returned unchanged
+
+    Parameters
+    ----------
+    coef_names : list or array-like
+        Coefficient names to rename, e.g. the index of a coefficient table.
+
+    Returns
+    -------
+    list
+        Renamed coefficient names, in input order and of the same length as
+        the input. Names that are not strings are passed through unchanged
+        and trigger a ``UserWarning``.
+
+    Examples
+    --------
+    >>> rename_did_coefficients(["C(f1, contr.treatment(base=1))[T.0.0]:X1"])
+    ['f1::0.0::X1']
+    >>> rename_did_coefficients(["C(rel_year, contr.treatment(base=-1))[T.-5]"])
+    ['rel_year::-5']
+    >>> rename_did_coefficients(["C(f1, contr.treatment(base=1)):X1"])
+    ['f1::X1']
+    >>> rename_did_coefficients(["Intercept"])
+    ['Intercept']
+    """
+    renamed_coefs = []
+    for name in coef_names:
+        if not isinstance(name, str):
+            # The patterns only apply to text; keep other labels as they are.
+            warnings.warn(
+                f"Coefficient name {name!r} is not a string and was not renamed.",
+                UserWarning,
+                stacklevel=2,
+            )
+            renamed_coefs.append(name)
+            continue
+
+        cat_var_match = CAT_VAR_PATTERN.search(name)
+        if cat_var_match is None:
+            # Not a categorical term (e.g. "Intercept"): keep the name.
+            renamed_coefs.append(name)
+            continue
+
+        # Collect the components that are present, in output order.
+        parts = [cat_var_match.group(1)]
+        level_match = LEVEL_PATTERN.search(name)
+        if level_match is not None:
+            parts.append(level_match.group(1))
+        interaction_match = INTERACTION_PATTERN.search(name)
+        if interaction_match is not None:
+            parts.append(interaction_match.group(1))
+        renamed_coefs.append("::".join(parts))
+
+    return renamed_coefs
diff --git a/tests/test_did_renaming.py b/tests/test_did_renaming.py
new file mode 100644
index 000000000..9ba452ac4
--- /dev/null
+++ b/tests/test_did_renaming.py
@@ -0,0 +1,120 @@
+"""Tests for renaming DiD / event-study coefficient names."""
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import pyfixest as pf
+from pyfixest.utils.utils import rename_did_coefficients
+
+
+@pytest.fixture
+def sample_data():
+    """Create sample data for testing."""
+    np.random.seed(123)
+    n = 1000
+    return pd.DataFrame(
+        {
+            "unit": np.repeat(range(100), 10),
+            "year": np.tile(range(2010, 2020), 100),
+            "X1": np.random.normal(0, 1, n),
+            "f1": np.random.choice([0, 1, 2, 3, 4], n),
+            "Y": np.random.normal(0, 1, n),
+        }
+    )
+
+
+@pytest.fixture
+def did_data():
+    """Create sample DID data for testing."""
+    np.random.seed(123)
+    n_units = 100
+    periods = 10
+    n = n_units * periods
+    # Treatment cohort is drawn once per unit; 0 marks never-treated units.
+    unit_groups = np.random.choice([0, 2015, 2016, 2017], n_units, replace=True)
+
+    data = pd.DataFrame(
+        {
+            "unit": np.repeat(range(n_units), periods),
+            "year": np.tile(range(2010, 2020), n_units),
+            "g": np.repeat(unit_groups, periods),
+            "state": np.random.choice(range(20), n),
+            "dep_var": np.random.normal(0, 1, n),
+        }
+    )
+    # Never-treated units get an infinite relative year and treat == 0.
+    data["rel_year"] = np.where(data["g"] == 0, np.inf, data["year"] - data["g"])
+    data["treat"] = np.where((data["g"] != 0) & (data["g"] <= data["year"]), 1, 0)
+    return data
+
+
+@pytest.mark.parametrize(
+    ("original", "expected"),
+    [
+        ("C(f1, contr.treatment(base=1))[T.0.0]:X1", "f1::0.0::X1"),
+        ("C(rel_year, contr.treatment(base=-1))[T.-5]", "rel_year::-5"),
+        ("C(f1, contr.treatment(base=1)):X1", "f1::X1"),
+        ("C(f1):X1", "f1::X1"),
+        ("Intercept", "Intercept"),
+    ],
+)
+def test_rename_did_coefficients(original, expected):
+    """Test the rename_did_coefficients function."""
+    assert rename_did_coefficients([original]) == [expected]
+
+
+def test_feols_renaming(sample_data):
+    """Test renaming with feols."""
+    fit = pf.feols("Y ~ i(f1, X1, ref = 1)", data=sample_data)
+    renamed_index = rename_did_coefficients(fit.coef().index)
+    assert any(name.startswith("f1::") for name in renamed_index), (
+        f"No renamed coefficients found starting with 'f1::'. Got: {renamed_index}"
+    )
+
+
+def test_did2s_renaming():
+    """Test renaming of DID2S coefficient names."""
+    sample_coef_names = [
+        "C(rel_year, contr.treatment(base=-1))[T.-5]",
+        "C(rel_year, contr.treatment(base=-1))[T.-4]",
+        "C(rel_year, contr.treatment(base=-1))[T.-3]",
+        "C(rel_year, contr.treatment(base=-1))[T.-2]",
+        "C(rel_year, contr.treatment(base=-1))[T.0]",
+        "C(rel_year, contr.treatment(base=-1))[T.1]",
+        "C(rel_year, contr.treatment(base=-1))[T.2]",
+    ]
+    renamed_coefs = rename_did_coefficients(sample_coef_names)
+    assert renamed_coefs == [
+        "rel_year::-5",
+        "rel_year::-4",
+        "rel_year::-3",
+        "rel_year::-2",
+        "rel_year::0",
+        "rel_year::1",
+        "rel_year::2",
+    ]
+
+
+def test_did2s_renaming_with_model(did_data):
+    """Test renaming with did2s model."""
+    # Only the model fit may skip the test. The assertions run outside the try
+    # block so that a failing assertion is reported instead of being skipped.
+    try:
+        fit_did2s = pf.did2s(
+            did_data,
+            yname="dep_var",
+            first_stage="~ 0 | unit + year",
+            second_stage="~i(rel_year, ref=-1)",
+            treatment="treat",
+            cluster="state",
+        )
+    except Exception as e:
+        pytest.skip(f"Skipping test due to error in model fitting: {e}")
+
+    coef_names = fit_did2s.tidy().index.tolist()
+    renamed_coefs = rename_did_coefficients(coef_names)
+    assert len(renamed_coefs) == len(coef_names), "Length mismatch after renaming"
+    assert any("rel_year::" in name for name in renamed_coefs), (
+        f"No renamed coefficients found with 'rel_year::'. Got: {renamed_coefs}"
+    )