param integration setup

washingtonpost · Jun 27, 2023 · b7a6305 · b7a6305
1 parent 3ebe72e
commit b7a6305
Show file tree

Hide file tree

Showing 9 changed files with 67 additions and 7 deletions.
diff --git a/notebooks/test-model.ipynb b/notebooks/test-model.ipynb
@@ -79,6 +79,7 @@
     "robust = False\n",
     "pi_method = \"gaussian\" #set for testing otherwise use `gaussian`\n",
     "beta = 1\n",
+    "winsorize = 1\n",
     "prediction_intervals = [0.7, 0.9]\n",
     "percent_reporting_threshold = 99"
    ]
@@ -308,7 +309,8 @@
     "        aggregates=aggregates,\n",
     "        fixed_effects=fixed_effects,\n",
     "        pi_method=pi_method,\n",
-    "        beta=beta\n",
+    "        beta=beta,\n",
+    "        winsorize=winsorize,\n",
     "    )\n",
     "    \n",
     "#     Turnout predictions\n",

diff --git a/src/elexmodel/cli.py b/src/elexmodel/cli.py
@@ -39,6 +39,7 @@
     type=click.Choice(["county", "precinct", "county-district", "precinct-district"]),
 )
 @click.option("--beta", "beta", default=1, type=int, help="manually add variance to Gaussian model")
+@click.option("--winsorize", "winsorize", default=1, type=int, help="reduce outliers in the Gaussian model")
 @click.option("--robust", "robust", is_flag=True, help="robust prediction intervals for nonparametric model")
 @click.option("--lambda", "lambda", default=0, type=float, help="regularization parameter")
 @click.option(

diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py
@@ -49,6 +49,7 @@ def _check_input_parameters(
         fixed_effects,
         pi_method,
         beta,
+        winsorize,
         robust,
         lambda_,
         handle_unreporting,
@@ -89,6 +90,8 @@ def _check_input_parameters(
             )
         if not isinstance(beta, (int, float)):
             raise ValueError("beta is not valid. Has to be either an integer or a float.")
+        if not isinstance(winsorize, int):
+            raise ValueError("winsorize is not valid. Has to be an integer.")
         if not isinstance(robust, bool):
             raise ValueError("robust is not valid. Has to be a boolean.")
         if not isinstance(lambda_, (float, int)):
@@ -149,6 +152,7 @@ def get_estimates(
         fixed_effects = kwargs.get("fixed_effects", {})
         pi_method = kwargs.get("pi_method", "nonparametric")
         beta = kwargs.get("beta", 1)
+        winsorize = kwargs.get("winsorize", 1)
         robust = kwargs.get("robust", False)
         lambda_ = kwargs.get("lambda_", 0)
         save_output = kwargs.get("save_output", ["results"])
@@ -163,6 +167,7 @@ def get_estimates(
             "office": office,
             "geographic_unit_type": geographic_unit_type,
             "beta": beta,
+            "winsorize": winsorize,
             "robust": robust,
             "lambda_": lambda_,
             "features": features,
@@ -184,6 +189,7 @@ def get_estimates(
             fixed_effects,
             pi_method,
             beta,
+            winsorize,
             robust,
             lambda_,
             handle_unreporting,

diff --git a/src/elexmodel/distributions/GaussianModel.py b/src/elexmodel/distributions/GaussianModel.py
@@ -70,7 +70,7 @@ def _get_n_units_per_group(self, conformalization_data, nonreporting_units, aggr
             .fillna({"n": 0})
         )
 
-    def _fit(self, conformalization_data, estimand, aggregate, alpha, beta):
+    def _fit(self, conformalization_data, estimand, aggregate, alpha, beta, winsorize):
         """
         Compute fit for Gaussian Model
         """
@@ -124,6 +124,7 @@ def fit(
         alpha=0.9,
         reweight=False,
         beta=1,
+        winsorize=1,
         top_level=True,
     ):
         """
@@ -157,6 +158,7 @@ def fit(
                 alpha=alpha,
                 reweight=reweight,
                 beta=beta,
+                winsorize=winsorize,
                 top_level=False,
             )
 
@@ -189,14 +191,15 @@ def fit(
                 alpha=alpha,
                 reweight=reweight,
                 beta=beta,
+                winsorize=winsorize,
                 top_level=False,
             )
 
             # combine large and small models
             x = pd.concat([gaussian_model_small_groups, gaussian_model_large_groups]).reset_index(drop=True)
         else:
             # when the group is large enough we can compute the Gaussian model for conformalization
-            x = self._fit(conformalization_data, estimand, aggregate, alpha, beta)
+            x = self._fit(conformalization_data, estimand, aggregate, alpha, beta, winsorize)
 
         # Write to s3 at the highest level of recursion before we exit GaussianModel
         # and return to GaussianElectionModel

diff --git a/src/elexmodel/models/GaussianElectionModel.py b/src/elexmodel/models/GaussianElectionModel.py
@@ -11,6 +11,7 @@ def __init__(self, model_settings={}):
         super().__init__(model_settings)
         self.model_settings = model_settings
         self.beta = model_settings.get("beta", 1)
+        self.winsorize = model_settings.get("winsorize", 1)
         self.alpha_to_nonreporting_lower_bounds = {}
         self.alpha_to_nonreporting_upper_bounds = {}
         self.modeled_bounds_agg = None
@@ -47,6 +48,7 @@ def get_unit_prediction_intervals(self, reporting_units, nonreporting_units, alp
             aggregate=[],
             alpha=alpha,
             beta=self.beta,
+            winsorize=self.winsorize,
         )
         self.gaussian_bounds_unit = gaussian_model
         self.conformalization_data_unit = prediction_intervals.conformalization
@@ -136,6 +138,7 @@ def get_aggregate_prediction_intervals(
             alpha=alpha,
             reweight=False,
             beta=self.beta,
+            winsorize=self.winsorize,
             top_level=True,
         )
 

diff --git a/src/elexmodel/utils/math_utils.py b/src/elexmodel/utils/math_utils.py
@@ -66,6 +66,9 @@ def weighted_median(x, weights):
 
 
 def robust_sample_std(x, axis):
+    """
+    Compute the robust sample standard deviation by calling winsorize_std along the last axis (axis=-1). The `axis` argument is currently ignored.
+    """
     return winsorize_std(x, axis=-1)
 
 

diff --git a/tests/distributions/test_gaussian_model.py b/tests/distributions/test_gaussian_model.py
@@ -130,6 +130,7 @@ def test_fit():
     weights = random_number_generator.randint(low=1, high=100, size=n)
     alpha = 0.9
     beta = 1
+    winsorize = 1
     estimand = "turnout"
     model_settings = {
         "election_id": "2017-11-07_VA_G",
@@ -143,7 +144,7 @@ def test_fit():
     df = pd.DataFrame({f"last_election_results_{estimand}": weights, "lower_bounds": lower, "upper_bounds": upper})
 
     # all in the same group
-    g = gaussian_model._fit(df, estimand, [], alpha, beta)
+    g = gaussian_model._fit(df, estimand, [], alpha, beta, winsorize)
 
     # assumes that weighted median and standard deviation bootstrap works
     # tests for that in test_utils
@@ -185,7 +186,7 @@ def test_fit():
     df = pd.concat([df_a, df_b])
 
     # fit model to multiple groups separately
-    g = gaussian_model._fit(df, estimand, ["group"], alpha, beta)
+    g = gaussian_model._fit(df, estimand, ["group"], alpha, beta, winsorize)
 
     assert math_utils.weighted_median(a, weights_a / weights_a.sum()) == pytest.approx(g.mu_lower_bound[0], TOL)
     assert math_utils.boot_sigma(a, conf=(3 + alpha) / 4) == pytest.approx(g.sigma_lower_bound[0], RELAX_TOL)
@@ -247,6 +248,7 @@ def test_large_and_small_fit():
 
     alpha = 0.9
     beta = 1
+    winsorize = 1
 
     reporting = pd.DataFrame({"group_1": ["general", "general"], "group_2": ["a", "b"]})
     nonreporting = pd.DataFrame({"group_1": ["general", "general"], "group_2": ["a", "b"]})
@@ -260,6 +262,7 @@ def test_large_and_small_fit():
         alpha=alpha,
         reweight=False,
         beta=beta,
+        winsorize=winsorize,
     )
 
     assert math_utils.weighted_median(general, general_weights / general_weights.sum()) == pytest.approx(

diff --git a/tests/models/test_gaussian_election_model.py b/tests/models/test_gaussian_election_model.py
@@ -6,16 +6,19 @@ def test_instantiation():
     model = GaussianElectionModel.GaussianElectionModel(model_settings=model_settings)
 
     assert model.beta == 1
+    assert model.winsorize == 1
 
-    model_settings = {"beta": 1}
+    model_settings = {"beta": 1, "winsorize": 1}
     model = GaussianElectionModel.GaussianElectionModel(model_settings=model_settings)
 
     assert model.beta == 1
+    assert model.winsorize == 1
 
-    model_settings = {"beta": 3}
+    model_settings = {"beta": 3, "winsorize": 0}
     model = GaussianElectionModel.GaussianElectionModel(model_settings=model_settings)
 
     assert model.beta == 3
+    assert model.winsorize == 0
 
 
 def test_compute_conf_frac():

diff --git a/tests/test_client.py b/tests/test_client.py
@@ -16,6 +16,7 @@
 fixed_effects = []
 pi_method = "gaussian"
 beta = 3
+winsorize = 1
 robust = True
 lambda_ = 0
 handle_unreporting = "drop"
@@ -35,6 +36,7 @@ def test_check_input_parameters(model_client, va_governor_config):
         fixed_effects,
         pi_method,
         beta,
+        winsorize,
         robust,
         lambda_,
         handle_unreporting,
@@ -56,6 +58,7 @@ def test_check_input_parameters_office(model_client, va_governor_config):
             fixed_effects,
             pi_method,
             beta,
+            winsorize,
             robust,
             lambda_,
             handle_unreporting,
@@ -77,6 +80,7 @@ def test_check_input_parameters_pi_method(model_client, va_governor_config):
             fixed_effects,
             "bad_pi_method",
             beta,
+            winsorize,
             robust,
             lambda_,
             handle_unreporting,
@@ -98,6 +102,7 @@ def test_check_input_parameters_estimand(model_client, va_governor_config):
             fixed_effects,
             pi_method,
             beta,
+            winsorize,
             robust,
             lambda_,
             handle_unreporting,
@@ -119,6 +124,7 @@ def test_check_input_parameters_geographic_unit_type(model_client, va_governor_c
             fixed_effects,
             pi_method,
             beta,
+            winsorize,
             robust,
             lambda_,
             handle_unreporting,
@@ -140,6 +146,7 @@ def test_check_input_parameters_features(model_client, va_governor_config):
             fixed_effects,
             pi_method,
             beta,
+            winsorize,
             robust,
             lambda_,
             handle_unreporting,
@@ -161,6 +168,7 @@ def test_check_input_parameters_aggregates(model_client, va_governor_config):
             fixed_effects,
             pi_method,
             beta,
+            winsorize,
             robust,
             lambda_,
             handle_unreporting,
@@ -182,6 +190,7 @@ def test_check_input_parameters_fixed_effect_list(model_client, va_governor_conf
             ["bad_fixed_effect"],
             pi_method,
             beta,
+            winsorize,
             robust,
             lambda_,
             handle_unreporting,
@@ -203,6 +212,7 @@ def test_check_input_parameters_fixed_effect_dict(model_client, va_governor_conf
             {"bad_fixed_effect": ["a", "b"]},
             pi_method,
             beta,
+            winsorize,
             robust,
             lambda_,
             handle_unreporting,
@@ -224,6 +234,29 @@ def test_check_input_parameters_beta(model_client, va_governor_config):
             fixed_effects,
             pi_method,
             "bad_beta",
+            winsorize,
+            robust,
+            lambda_,
+            handle_unreporting,
+        )
+
+
+def test_check_input_parameters_winsorize(model_client, va_governor_config):
+    election_id = "2017-11-07_VA_G"
+    config_handler = ConfigHandler(election_id, config=va_governor_config)
+
+    with pytest.raises(ValueError):
+        model_client._check_input_parameters(
+            config_handler,
+            office,
+            estimands,
+            geographic_unit_type,
+            features,
+            aggregates,
+            fixed_effects,
+            pi_method,
+            beta,
+            "bad_winsorize",
             robust,
             lambda_,
             handle_unreporting,
@@ -245,6 +278,7 @@ def test_check_input_parameters_robust(model_client, va_governor_config):
             fixed_effects,
             pi_method,
             beta,
+            winsorize,
             "bad_robust",
             lambda_,
             handle_unreporting,
@@ -266,6 +300,7 @@ def test_check_input_parameters_lambda_(model_client, va_governor_config):
             fixed_effects,
             pi_method,
             beta,
+            winsorize,
             robust,
             -1,
             handle_unreporting,
@@ -287,6 +322,7 @@ def test_check_input_parameters_handle_unreporting(model_client, va_governor_con
             fixed_effects,
             pi_method,
             beta,
+            winsorize,
             robust,
             lambda_,
             "bad_handle_unreporting",