diff --git a/.github/workflows/ci-tests.yaml b/.github/workflows/ci-tests.yaml index 99e86abfc..a57adf743 100644 --- a/.github/workflows/ci-tests.yaml +++ b/.github/workflows/ci-tests.yaml @@ -52,6 +52,9 @@ jobs: run: | pixi run tests-regular + - name: Build coverage.xml + run: pixi run coverage-report + - name: Upload coverage to Codecov (partial) uses: codecov/codecov-action@v4 with: @@ -101,6 +104,9 @@ jobs: pixi run tests-against-r-core pixi run tests-against-r-extended + - name: Build coverage.xml + run: pixi run coverage-report + - name: Upload coverage to Codecov (partial) uses: codecov/codecov-action@v4 with: @@ -109,18 +115,6 @@ jobs: flags: tests-vs-r files: coverage.xml - merge_coverage: - name: "Merge Coverage" - runs-on: ubuntu-latest - needs: [test, test_slow] - steps: - - name: Final coverage merge - uses: codecov/codecov-action@v4 - with: - token: ${{ secrets.CODECOV_TOKEN }} - partial: false - flags: final - build-docs: name: "Build Docs" runs-on: ubuntu-latest diff --git a/pixi.lock b/pixi.lock index bb0504eb7..9f1050ad5 100644 --- a/pixi.lock +++ b/pixi.lock @@ -10414,7 +10414,7 @@ packages: - pypi: . name: pyfixest version: 0.29.0 - sha256: 60ebc67895f027d059ad28577c381104782bd83cd834ef76149135cfc203b86d + sha256: 10e69f2d47ed0032f3aa67573cbdc9b8948586be0352b0270d48bf4159f678d1 requires_dist: - scipy>=1.6 - formulaic>=1.1.0 diff --git a/pyfixest/did/did.py b/pyfixest/did/did.py index 1269a34bd..824898248 100644 --- a/pyfixest/did/did.py +++ b/pyfixest/did/did.py @@ -26,13 +26,17 @@ class DID(ABC): YYYYMMDDHHMMSS, i.e. it must be possible to compare two dates via '>'. Datetime variables are currently not accepted. Never treated units must have a value of 0. - xfml : str + cluster : Optional[str] + The name of the cluster variable. + weights : Optional[str] + Default is None. Weights for WLS estimation. If None, all observations + are weighted equally. If a string, the name of the column in `data` that + contains the weights. Must be analytic weights for now. + xfml : Optional[str] The formula for the covariates. - att : str + att : Optional[bool], default=True Whether to estimate the average treatment effect on the treated (ATT) or the canonical event study design with all leads and lags. Default is True. - cluster : str - The name of the cluster variable. """ @abstractmethod @@ -44,8 +48,9 @@ def __init__( tname: str, gname: str, cluster: Optional[str] = None, + weights: Optional[str] = None, xfml: Optional[str] = None, - att: bool = True, + att: Optional[bool] = True, ): # do some checks here @@ -57,9 +62,10 @@ def __init__( self._xfml = xfml self._att = att self._cluster = cluster + self._weights = weights + self._weights_type = "aweights" # check if tname and gname are of type int (either int 64, 32, 8) - for var in [self._tname, self._gname]: if self._data[var].dtype not in [ "int64", diff --git a/pyfixest/did/did2s.py b/pyfixest/did/did2s.py index 0ba81aabd..d07a0316e 100644 --- a/pyfixest/did/did2s.py +++ b/pyfixest/did/did2s.py @@ -36,18 +36,18 @@ class DID2S(DID): YYYYMMDDHHMMSS, i.e. it must be possible to compare two dates via '>'. Datetime variables are currently not accepted. Never treated units must have a value of 0. - xfml : str - The formula for the covariates. - att : str - Whether to estimate the pooled average treatment effect on the treated - (ATT) or the canonical event study design with all leads and lags / the - ATT for each period. Default is True. cluster : str The name of the cluster variable. - weights : Optional[str]. + weights : Optional[str] Default is None. Weights for WLS estimation. If None, all observations are weighted equally. If a string, the name of the column in `data` that - contains the weights. + contains the weights. Must be analytic weights for now. + xfml : Optional[str] + The formula for the covariates. + att : Optional[bool], default=True + Whether to estimate the pooled average treatment effect on the treated + (ATT) or the canonical event study design with all leads and lags / the + ATT for each period. Default is True. """ def __init__( @@ -59,8 +59,8 @@ def __init__( gname: str, cluster: str, weights: Optional[str] = None, - att: bool = True, xfml: Optional[str] = None, + att: Optional[bool] = True, ): super().__init__( data=data, @@ -71,6 +71,7 @@ def __init__( xfml=xfml, att=att, cluster=cluster, + weights=weights, ) self._estimator = "did2s" @@ -86,9 +87,6 @@ def __init__( self._first_u = np.array([]) self._second_u = np.array([]) - # column name with weights. None by default - self._weights_name = weights - def estimate(self): """Estimate the two-step DID2S model.""" return _did2s_estimate( @@ -96,7 +94,7 @@ def estimate(self): yname=self._yname, _first_stage=self._fml1, _second_stage=self._fml2, - weights=self._weights_name, + weights=self._weights, treatment="is_treated", ) # returns triple Feols, first_u, second_u @@ -121,7 +119,7 @@ def vcov(self): first_u=self._first_u, second_u=self._second_u, cluster=self._cluster, - weights=self._weights_name, + weights=self._weights, ) def iplot( @@ -175,10 +173,10 @@ def _did2s_estimate( The formula for the second stage. treatment: str The name of the treatment variable. Must be boolean. - weights : Optional[str]. + weights : Optional[str] Default is None. Weights for WLS estimation. If None, all observations are weighted equally. If a string, the name of the column in `data` that - contains the weights. + contains the weights. Must be analytic weights for now. Returns ------- @@ -283,10 +281,10 @@ def _did2s_vcov( The second stage residuals. cluster: str The name of the cluster variable. - weights : Optional[str]. + weights : Optional[str] Default is None. Weights for WLS estimation. If None, all observations are weighted equally. If a string, the name of the column in `data` that - contains the weights. + contains the weights. Must be analytic weights for now. Returns ------- diff --git a/pyfixest/did/estimation.py b/pyfixest/did/estimation.py index 8edeff79a..72c80cc49 100644 --- a/pyfixest/did/estimation.py +++ b/pyfixest/did/estimation.py @@ -1,4 +1,4 @@ -from typing import Optional, Union +from typing import Literal, Optional, Union import pandas as pd @@ -15,17 +15,19 @@ def event_study( idname: str, tname: str, gname: str, - xfml: Optional[str] = None, cluster: Optional[str] = None, - estimator: Optional[str] = "twfe", + weights: Optional[str] = None, + xfml: Optional[str] = None, att: Optional[bool] = True, + estimator: Optional[Literal["did2s", "twfe", "saturated"]] = "twfe", ): """ Estimate Event Study Model. This function allows for the estimation of treatment effects using different estimators. Currently, it supports "twfe" for the two-way fixed effects - estimator and "did2s" for Gardner's two-step DID2S estimator. Other estimators + estimator, "did2s" for Gardner's two-step DID2S estimator, and "saturated" for + a Sun & Abraham staggered event study estimator. Other estimators are in development. Parameters @@ -42,14 +44,18 @@ def event_study( Unit-specific time of initial treatment. cluster: Optional[str] The name of the cluster variable. If None, defaults to idname. - xfml : str + weights : Optional[str] + Default is None. Weights for WLS estimation. If None, all observations + are weighted equally. If a string, the name of the column in `data` that + contains the weights. Must be analytic weights for now. + xfml : Optional[str] The formula for the covariates. - estimator : str - The estimator to use. Options are "did2s", "twfe", and "saturated". - att : bool, optional + att : Optional[bool] If True, estimates the average treatment effect on the treated (ATT). If False, estimates the canonical event study design with all leads and lags. Default is True. + estimator : Optional[str], default="twfe" + The estimator to use. Options are "did2s", "twfe", and "saturated". Returns ------- @@ -96,10 +102,11 @@ def event_study( assert isinstance(idname, str), "idname must be a string" assert isinstance(tname, str), "tname must be a string" assert isinstance(gname, str), "gname must be a string" + assert isinstance(cluster, str) or cluster is None, "cluster must be a string" + assert isinstance(weights, str) or weights is None, "weights must be a string" assert isinstance(xfml, str) or xfml is None, "xfml must be a string or None" - assert isinstance(estimator, str), "estimator must be a string" assert isinstance(att, bool), "att must be a boolean" - assert isinstance(cluster, str) or cluster is None, "cluster must be a string" + assert isinstance(estimator, str), "estimator must be a string" cluster = idname if cluster is None else cluster @@ -110,9 +117,10 @@ def event_study( idname=idname, tname=tname, gname=gname, + cluster=cluster, + weights=weights, xfml=xfml, att=att, - cluster=cluster, ) fit, did2s._first_u, did2s._second_u = did2s.estimate() @@ -130,9 +138,10 @@ def event_study( idname=idname, tname=tname, gname=gname, + cluster=cluster, + weights=weights, xfml=xfml, att=att, - cluster=cluster, ) fit = twfe.estimate() fit._yname = twfe._yname @@ -151,9 +160,10 @@ def event_study( idname=idname, tname=tname, gname=gname, + cluster=cluster, + weights=weights, xfml=xfml, att=att, - cluster=cluster, ) fit = saturated.estimate() vcov = fit.vcov(vcov={"CRV1": cluster}) diff --git a/pyfixest/did/saturated_twfe.py b/pyfixest/did/saturated_twfe.py index a5b9194a9..bc29cea6e 100644 --- a/pyfixest/did/saturated_twfe.py +++ b/pyfixest/did/saturated_twfe.py @@ -31,6 +31,10 @@ class SaturatedEventStudy(DID): Name of the treatment variable. cluster : str The name of the cluster variable. + weights : Optional[str] + Default is None. Weights for WLS estimation. If None, all observations + are weighted equally. If a string, the name of the column in `data` that + contains the weights. Must be analytic weights for now. xfml : str Additional covariates to include in the model. att : bool @@ -47,10 +51,11 @@ def __init__( idname: str, tname: str, gname: str, - att: bool = True, cluster: Optional[str] = None, + weights: Optional[str] = None, xfml: Optional[str] = None, - display_warning: bool = True, + att: Optional[bool] = True, + display_warning: Optional[bool] = True, ): super().__init__( data=data, @@ -61,6 +66,7 @@ def __init__( cluster=cluster, xfml=xfml, att=att, + weights=weights, ) self._estimator = "Saturated Event Study" @@ -83,6 +89,7 @@ def estimate(self) -> Feols: outcome=self._yname, time_id=self._tname, unit_id=self._idname, + weights=self._weights, cluster=self._cluster, ) @@ -151,23 +158,31 @@ def test_treatment_heterogeneity(self) -> pd.Series: ) def aggregate( - self, agg="period", weighting: Optional[str] = "shares" + self, + agg="period", + use_weights: bool = True, + weighting: Optional[str] = "shares", ) -> pd.DataFrame: """ Aggregate the fully interacted event study estimates by relative time, cohort, and time. Parameters ---------- - agg : str, optional + agg : str + The type of aggregation to perform. Currently only "period" is supported. - The type of aggregation to perform. Can be either "att" or "cohort" or "period". - Default is "att". If "att", computes the average treatment effect on the treated. - If "cohort", computes the average treatment effect by cohort. If "period", - computes the average treatment effect by period. + Unimplemented: Can be either "att" or "cohort" or "period". + Default is "att". If "att", computes the average treatment effect on the treated. + If "cohort", computes the average treatment effect by cohort. If "period", + computes the average treatment effect by period. + + use_weights : bool, default=True + Whether to use analytic weights in the aggregation. + If True, uses the weights provided in the model set up. weighting : str, optional + The type of weighting to use. Can be either 'shares' or 'variance'. - The type of weighting to use. Can be either 'shares' or 'variance'. Returns ------- @@ -198,6 +213,8 @@ def aggregate( cohort=model._gname, period="rel_time", treatment="is_treated", + weights=model._weights_name, + use_weights=use_weights, ).set_index([self._gname, "rel_time"]) treated_periods = list(period_set) @@ -317,6 +334,7 @@ def _saturated_event_study( outcome: str, time_id: str, unit_id: str, + weights: Optional[str] = None, cluster: Optional[str] = None, ): cohort_dummies = pd.get_dummies( @@ -329,7 +347,13 @@ def _saturated_event_study( {"+".join([f"i(rel_time, {x}, ref = -1.0)" for x in cohort_dummies.columns.tolist()])} | {unit_id} + {time_id} """ - m = feols(fml=ff, data=df_int, vcov={"CRV1": cluster}) # type: ignore + m = feols( + fml=ff, + data=df_int, + weights=weights, + vcov={"CRV1": cluster}, + weights_type="aweights", + ) # type: ignore res = m.tidy() # create a dict with cohort specific effect curves res_cohort_eventtime_dict: dict[str, dict[str, pd.DataFrame | np.ndarray]] = {} @@ -389,6 +413,8 @@ def compute_period_weights( period: str = "rel_time", treatment: str = "treatment", include_grid: bool = True, + weights: Optional[str] = None, + use_weights: bool = True, ) -> pd.DataFrame: """ Compute Sun & Abraham interaction weights for all relative times. @@ -411,6 +437,11 @@ def compute_period_weights( Column name of treatment indicator (0/1). include_grid : bool, default True If True, returns a full (cohort x period) grid with zero-filled weights. + weights : Optional[str], default None + If provided, the name of the column in `data` that contains weights. + use_weights : bool, default True + If True, uses the analytic weights provided in the `weights` column for aggregation. + If False, uses simple counts. Returns ------- @@ -418,28 +449,48 @@ def compute_period_weights( Columns [cohort, period, weight]. If `include_grid`, every combination appears (with weight=0 where not defined). """ - df = data[[cohort, period, treatment]].copy() + columns = [cohort, period, treatment] + if weights is not None: + columns.append(weights) + + df = data[columns].copy() ever_treated = df.loc[df[treatment] == 1, cohort].unique() # post-treatment cells (l > 0) - post = ( - df[df[treatment] == 1] - .groupby([cohort, period]) - .size() - .reset_index(name="n_grel") - ) + if weights is not None and use_weights: + post = ( + df[df[treatment] == 1] + .groupby([cohort, period])[weights] + .sum() + .reset_index(name="n_grel") + ) + else: + post = ( + df[df[treatment] == 1] + .groupby([cohort, period]) + .size() + .reset_index(name="n_grel") + ) post = post[post[period] >= 0] denom_post = post.groupby(period)["n_grel"].sum().reset_index(name="n_rel") post = post.merge(denom_post, on=period) post["weight"] = post["n_grel"] / post["n_rel"] # pre-treatment cells (l < 0) - pre = ( - df[(df[treatment] == 0) & (df[cohort].isin(ever_treated))] - .groupby([cohort, period]) - .size() - .reset_index(name="n_grel") - ) + if weights is not None and use_weights: + pre = ( + df[(df[treatment] == 0) & (df[cohort].isin(ever_treated))] + .groupby([cohort, period])[weights] + .sum() + .reset_index(name="n_grel") + ) + else: + pre = ( + df[(df[treatment] == 0) & (df[cohort].isin(ever_treated))] + .groupby([cohort, period]) + .size() + .reset_index(name="n_grel") + ) pre = pre[pre[period] < 0] denom_pre = pre.groupby(period)["n_grel"].sum().reset_index(name="n_rel") pre = pre.merge(denom_pre, on=period) diff --git a/pyfixest/did/twfe.py b/pyfixest/did/twfe.py index 45a7a960a..7bbb5b075 100644 --- a/pyfixest/did/twfe.py +++ b/pyfixest/did/twfe.py @@ -31,13 +31,17 @@ class TWFE(DID): YYYYMMDDHHMMSS, i.e. it must be possible to compare two dates via '>'. Datetime variables are currently not accepted. Never treated units must have a value of 0. - xfml: str + cluster: Optional[str], default="idname" + The name of the cluster variable. + weights : Optional[str] + Default is None. Weights for WLS estimation. If None, all observations + are weighted equally. If a string, the name of the column in `data` that + contains the weights. Must be analytic weights for now. + xfml: Optional[str] The formula for the covariates. - att: bool + att: Optional[bool], default=True Whether to estimate the average treatment effect on the treated (ATT) or the canonical event study design with all leads and lags. Default is True. - cluster: Optional[str] - The name of the cluster variable. """ def __init__( @@ -47,9 +51,10 @@ def __init__( idname: str, tname: str, gname: str, - xfml: Optional[str] = None, - att: bool = True, cluster: Optional[str] = "idname", + weights: Optional[str] = None, + xfml: Optional[str] = None, + att: Optional[bool] = True, ) -> None: super().__init__( data=data, @@ -60,6 +65,7 @@ def __init__( xfml=xfml, att=att, cluster=cluster, + weights=weights, ) self._estimator = "twfe" @@ -74,7 +80,15 @@ def estimate(self): _fml = self._fml _data = self._data - fit = cast(Feols, feols(fml=_fml, data=_data)) + fit = cast( + Feols, + feols( + fml=_fml, + data=_data, + weights=self._weights, + weights_type=self._weights_type, + ), + ) self._fit = fit return fit diff --git a/pyfixest/estimation/estimation.py b/pyfixest/estimation/estimation.py index 559654ca0..8bbdaa106 100644 --- a/pyfixest/estimation/estimation.py +++ b/pyfixest/estimation/estimation.py @@ -23,7 +23,7 @@ def feols( fml: str, data: DataFrameType, # type: ignore vcov: Optional[Union[VcovTypeOptions, dict[str, str]]] = None, - weights: Union[None, str] = None, + weights: Optional[str] = None, ssc: Optional[dict[str, Union[str, bool]]] = None, fixef_rm: FixedRmOptions = "none", fixef_tol=1e-08, diff --git a/pyfixest/report/summarize.py b/pyfixest/report/summarize.py index c9fb8ad0a..56c9f438a 100644 --- a/pyfixest/report/summarize.py +++ b/pyfixest/report/summarize.py @@ -568,6 +568,8 @@ def summary(models: ModelInputType, digits: int = 3) -> None: estimation_method = "TWFE" elif fxst._method == "did2s": estimation_method = "DID2S" + elif fxst._method == "saturated": + estimation_method = "Saturated Event Study" else: raise ValueError("Unknown estimation method.") print("###") diff --git a/pyproject.toml b/pyproject.toml index 0c2b8e391..ac9ce36f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,13 +102,14 @@ plots = { features = ["plots"], solve-group = "default" } [tool.pixi.feature.dev.tasks] tests = "pytest -rs -n 4 --cov-report=term tests" -tests-against-r-core = "pytest -rs tests -n 4 -m \"against_r_core\" --cov=pyfixest --cov-report=xml" -tests-against-r-extended = "pytest -rs tests -n 4 -m \"against_r_extended\" --cov=pyfixest --cov-report=xml" -tests-regular = "pytest tests -n 4 -m \"not (extended or against_r_core or against_r_extended or plots)\" --cov=pyfixest --cov-report=xml" -tests-extended = "pytest tests -n 4 -m \"extended\" --cov=pyfixest --cov-report=xml" -tests-fixest = "pytest -rs tests/test_vs_fixest.py -n 4 --cov=pyfixest --cov-report=xml" +tests-against-r-core = "pytest -rs tests -n 4 -m 'against_r_core' --cov=pyfixest --cov-append --cov-report=term-missing" +tests-against-r-extended = "pytest -rs tests -n 4 -m 'against_r_extended' --cov=pyfixest --cov-append --cov-report=term-missing" +tests-regular = "pytest tests -n 4 -m 'not (extended or against_r_core or against_r_extended or plots)' --cov=pyfixest --cov-append --cov-report=term-missing" +tests-extended = "pytest tests -n 4 -m 'extended' --cov=pyfixest --cov-append --cov-report=term-missing" +tests-fixest = "pytest -rs tests/test_vs_fixest.py -n 4 --cov=pyfixest --cov-append --cov-report=term-missing" tests-plots-dev = "pixi run --environment dev pytest tests/test_plots.py -n 4" tests-plots = "pixi run --environment plots pytest tests/test_plots.py -n 4" +coverage-report = "coverage xml -i" tests-rerun = "pytest --lf -n 4" debug = "python pyfixest/debug.py" update-test-data = "Rscript tests/r_test_comparisons.R" diff --git a/tests/test_did.py b/tests/test_did.py index bff62b13d..8a73a4a9d 100644 --- a/tests/test_did.py +++ b/tests/test_did.py @@ -1,4 +1,3 @@ -from importlib import resources from pathlib import Path import numpy as np @@ -291,29 +290,30 @@ def test_lpdid(): @pytest.mark.against_r_core @pytest.mark.parametrize("unit", ["unit"]) @pytest.mark.parametrize("cluster", ["unit", "unit2"]) -def test_fully_interacted(unit, cluster): - df_multi_cohort = pd.read_csv( - resources.files("pyfixest.did.data").joinpath("df_het.csv") - ) +@pytest.mark.parametrize("weights", [None, "weights"]) +def test_fully_interacted(data, unit, cluster, weights): if cluster == "unit2": rng = np.random.default_rng(21) - df_multi_cohort["unit2"] = rng.choice(range(100), size=len(df_multi_cohort)) + data["unit2"] = rng.choice(range(100), size=len(data)) + extra_fixest_args = {"weights": data[weights]} if weights is not None else {} saturated_py = pf.event_study( - data=df_multi_cohort, + data=data, yname="dep_var", idname=unit, tname="year", gname="g", estimator="saturated", cluster=cluster, + weights=weights, ) saturated_py.test_treatment_heterogeneity() saturated_r = fixest.feols( ro.Formula("dep_var ~ 1 + sunab(g, year, no_agg = TRUE) | unit + year"), - data=df_multi_cohort, + data=data, vcov=ro.Formula(f"~{cluster}"), + **extra_fixest_args, ) r_tidy = pd.DataFrame(broom.tidy_fixest(saturated_r)).T @@ -341,8 +341,9 @@ def test_fully_interacted(unit, cluster): saturated_agg_r = fixest.feols( ro.Formula("dep_var ~ 1 + sunab(g, year, no_agg = FALSE) | unit + year"), - data=df_multi_cohort, + data=data, vcov=ro.Formula(f"~{cluster}"), + **extra_fixest_args, ) r_agg_tidy = pd.DataFrame(broom.tidy_fixest(saturated_agg_r)).T diff --git a/tests/test_event_study.py b/tests/test_event_study.py index ee2a101c5..8552c0510 100644 --- a/tests/test_event_study.py +++ b/tests/test_event_study.py @@ -8,11 +8,14 @@ @pytest.fixture def data(): + rng = np.random.default_rng(1243) df_het = pd.read_csv("pyfixest/did/data/df_het.csv") + df_het["weights"] = rng.uniform(0, 10, size=len(df_het)) return df_het -def test_event_study_twfe(data): +@pytest.mark.parametrize("weights", [None, "weights"]) +def test_event_study_twfe(data, weights): twfe = event_study( data=data, yname="dep_var", @@ -21,9 +24,10 @@ def test_event_study_twfe(data): gname="g", att=True, estimator="twfe", + weights=weights, ) - twfe_feols = pf.feols("dep_var ~ treat | state + year", data=data) + twfe_feols = pf.feols("dep_var ~ treat | state + year", data=data, weights=weights) assert np.allclose(twfe.coef().values, twfe_feols.coef().values), ( "TWFE coefficients are not the same." @@ -43,7 +47,8 @@ def test_event_study_twfe(data): # ), "TWFE confidence intervals are not the same." -def test_event_study_did2s(data): +@pytest.mark.parametrize("weights", [None, "weights"]) +def test_event_study_did2s(data, weights): event_study_did2s = event_study( data=data, yname="dep_var", @@ -52,6 +57,7 @@ def test_event_study_did2s(data): gname="g", att=True, estimator="did2s", + weights=weights, ) fit_did2s = did2s( @@ -61,6 +67,7 @@ def test_event_study_did2s(data): second_stage="~treat", treatment="treat", cluster="state", + weights=weights, ) assert np.allclose(event_study_did2s.coef().values, fit_did2s.coef().values), (