some doc string additions, and cleaning.

NREL · Oct 5, 2024 · 704ad07 · 704ad07
1 parent 2638213
commit 704ad07
Show file tree

Hide file tree

Showing 15 changed files with 158 additions and 272 deletions.
diff --git a/.gitignore b/.gitignore
@@ -36,6 +36,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+mlclouds/_version.py
 
 # PyInstaller
 #  Usually these files are written by a python script from a template

diff --git a/mlclouds/autoxval.py b/mlclouds/autoxval.py
@@ -1,9 +1,5 @@
-"""
-Automatic cross validation of PHYGNN models predicting opd and reff
-
-Mike Bannister 7/2020
-Based on code by Grant Buster
-"""
+"""Automatic cross validation of PHYGNN models predicting cloud types and
+properties"""
 
 import json
 import logging

diff --git a/mlclouds/data_cleaners.py b/mlclouds/data_cleaners.py
@@ -1,9 +1,4 @@
-"""
-Automatic cross validation of PhyGNN models predicting opd and reff
-
-Mike Bannister 7/2020
-Based on code by Grant Buster
-"""
+"""Training data cleaning methods"""
 
 import logging
 import time
@@ -17,14 +12,17 @@
 
 def sky_class_filter(cloud_df):
     """Filter cloud data so that final data includes only data with matching
-    sky_class and cloud types."""
+    sky_class and cloud types. ``sky_class`` is determined by comparing clear
+    sky REST2 predictions with ground measurements. If the clear sky prediction
+    is within a given tolerance the class is considered "clear", otherwise the
+    class is considered "cloudy"."""
 
-    cloudy = cloud_df["cloud_type"].isin(ICE_TYPES + WATER_TYPES)
-    clear = cloud_df["cloud_type"].isin(CLEAR_TYPES)
+    cloudy = cloud_df['cloud_type'].isin(ICE_TYPES + WATER_TYPES)
+    clear = cloud_df['cloud_type'].isin(CLEAR_TYPES)
 
-    if "sky_class" in cloud_df.columns:
-        cloudy &= cloud_df["sky_class"].isin(("cloudy",))
-        clear &= cloud_df["sky_class"].isin(("clear",))
+    if 'sky_class' in cloud_df.columns:
+        cloudy &= cloud_df['sky_class'].isin(('cloudy',))
+        clear &= cloud_df['sky_class'].isin(('clear',))
 
     return clear | cloudy
 
@@ -35,96 +33,98 @@ def clean_cloud_df(
     filter_clear=False,
     add_cloud_flag=True,
     sza_lim=89,
-    nan_option="interp",
+    nan_option='interp',
 ):
-    """Clean up cloud data"""
+    """Clean up cloud data. Includes options to filter night and clear
+    timesteps, timesteps with sza which exceed a given threshold, and to add
+    cloud flag labels"""
     t0 = time.time()
     cloud_df = cloud_df_raw.copy()
-    day = cloud_df["solar_zenith_angle"] < sza_lim
+    day = cloud_df['solar_zenith_angle'] < sza_lim
 
-    day_missing_ctype = day & (cloud_df["cloud_type"] < 0)
-    cloud_df.loc[(cloud_df["cloud_type"] < 0), "cloud_type"] = np.nan
-    cloud_df["cloud_type"] = (
-        cloud_df["cloud_type"].interpolate("nearest").ffill().bfill()
+    day_missing_ctype = day & (cloud_df['cloud_type'] < 0)
+    cloud_df.loc[(cloud_df['cloud_type'] < 0), 'cloud_type'] = np.nan
+    cloud_df['cloud_type'] = (
+        cloud_df['cloud_type'].interpolate('nearest').ffill().bfill()
     )
 
-    cloudy = cloud_df["cloud_type"].isin(ICE_TYPES + WATER_TYPES)
+    cloudy = cloud_df['cloud_type'].isin(ICE_TYPES + WATER_TYPES)
     day_clouds = day & cloudy
-    day_missing_opd = day_clouds & (cloud_df["cld_opd_dcomp"] <= 0)
-    day_missing_reff = day_clouds & (cloud_df["cld_reff_dcomp"] <= 0)
-    cloud_df.loc[(cloud_df["cld_opd_dcomp"] <= 0), "cld_opd_dcomp"] = np.nan
-    cloud_df.loc[(cloud_df["cld_reff_dcomp"] <= 0), "cld_reff_dcomp"] = np.nan
+    day_missing_opd = day_clouds & (cloud_df['cld_opd_dcomp'] <= 0)
+    day_missing_reff = day_clouds & (cloud_df['cld_reff_dcomp'] <= 0)
+    cloud_df.loc[(cloud_df['cld_opd_dcomp'] <= 0), 'cld_opd_dcomp'] = np.nan
+    cloud_df.loc[(cloud_df['cld_reff_dcomp'] <= 0), 'cld_reff_dcomp'] = np.nan
 
     logger.info(
-        "{:.2f}% of timesteps are daylight".format(100 * day.sum() / len(day))
+        '{:.2f}% of timesteps are daylight'.format(100 * day.sum() / len(day))
     )
     logger.info(
-        "{:.2f}% of daylight timesteps are cloudy".format(
+        '{:.2f}% of daylight timesteps are cloudy'.format(
             100 * day_clouds.sum() / day.sum()
         )
     )
     logger.info(
-        "{:.2f}% of daylight timesteps are missing cloud type".format(
+        '{:.2f}% of daylight timesteps are missing cloud type'.format(
             100 * day_missing_ctype.sum() / day.sum()
         )
     )
     logger.info(
-        "{:.2f}% of cloudy daylight timesteps are missing cloud opd".format(
+        '{:.2f}% of cloudy daylight timesteps are missing cloud opd'.format(
             100 * day_missing_opd.sum() / day_clouds.sum()
         )
     )
     logger.info(
-        "{:.2f}% of cloudy daylight timesteps are missing cloud reff".format(
+        '{:.2f}% of cloudy daylight timesteps are missing cloud reff'.format(
             100 * day_missing_reff.sum() / day_clouds.sum()
         )
     )
 
-    logger.debug("Column NaN values:")
+    logger.debug('Column NaN values:')
     for c in cloud_df.columns:
         pnan = 100 * pd.isna(cloud_df[c]).sum() / len(cloud_df)
         logger.debug('\t"{}" has {:.2f}% NaN values'.format(c, pnan))
 
-    if "interp" in nan_option.lower():
-        logger.debug("Interpolating opd and reff")
+    if 'interp' in nan_option.lower():
+        logger.debug('Interpolating opd and reff')
 
-        if "time_index" in cloud_df.columns:
+        if 'time_index' in cloud_df.columns:
             time_index = cloud_df.time_index
             assert time_index.isnull().sum() == 0
-            cloud_df = cloud_df.drop("time_index", axis=1)
-            cloud_df = cloud_df.interpolate("nearest").ffill().bfill()
-            cloud_df["time_index"] = time_index
+            cloud_df = cloud_df.drop('time_index', axis=1)
+            cloud_df = cloud_df.interpolate('nearest').ffill().bfill()
+            cloud_df['time_index'] = time_index
         else:
-            cloud_df = cloud_df.interpolate("nearest").ffill().bfill()
+            cloud_df = cloud_df.interpolate('nearest').ffill().bfill()
 
-        cloud_df.loc[~cloudy, "cld_opd_dcomp"] = 0.0
-        cloud_df.loc[~cloudy, "cld_reff_dcomp"] = 0.0
-    elif "drop" in nan_option.lower():
+        cloud_df.loc[~cloudy, 'cld_opd_dcomp'] = 0.0
+        cloud_df.loc[~cloudy, 'cld_reff_dcomp'] = 0.0
+    elif 'drop' in nan_option.lower():
         l0 = len(cloud_df)
-        cloud_df = cloud_df.dropna(axis=0, how="any")
-        day = cloud_df["solar_zenith_angle"] < sza_lim
-        cloudy = cloud_df["cloud_type"].isin(ICE_TYPES + WATER_TYPES)
+        cloud_df = cloud_df.dropna(axis=0, how='any')
+        day = cloud_df['solar_zenith_angle'] < sza_lim
+        cloudy = cloud_df['cloud_type'].isin(ICE_TYPES + WATER_TYPES)
         logger.debug(
-            "Dropped {} rows with NaN values.".format(l0 - len(cloud_df))
+            'Dropped {} rows with NaN values.'.format(l0 - len(cloud_df))
         )
 
-    assert ~any(cloud_df["cloud_type"] < 0)
+    assert ~any(cloud_df['cloud_type'] < 0)
     assert ~any(pd.isna(cloud_df))
-    assert ~any(cloudy & (cloud_df["cld_opd_dcomp"] <= 0))
+    assert ~any(cloudy & (cloud_df['cld_opd_dcomp'] <= 0))
 
     if add_cloud_flag:
         logger.debug(
-            "Adding cloud type flag (e.g. flag=[night, clear, "
-            "ice_cloud, water_cloud, bad_cloud])"
+            'Adding cloud type flag (e.g. flag=[night, clear, '
+            'ice_cloud, water_cloud, bad_cloud])'
         )
-        ice_clouds = cloud_df["cloud_type"].isin(ICE_TYPES)
-        water_clouds = cloud_df["cloud_type"].isin(WATER_TYPES)
-        cloud_df["flag"] = "night"
-        cloud_df.loc[day, "flag"] = "clear"
-        cloud_df.loc[ice_clouds, "flag"] = "ice_cloud"
-        cloud_df.loc[water_clouds, "flag"] = "water_cloud"
-        cloud_df.loc[day_missing_ctype, "flag"] = "bad_cloud"
-        cloud_df.loc[day_missing_opd, "flag"] = "bad_cloud"
-        cloud_df.loc[day_missing_reff, "flag"] = "bad_cloud"
+        ice_clouds = cloud_df['cloud_type'].isin(ICE_TYPES)
+        water_clouds = cloud_df['cloud_type'].isin(WATER_TYPES)
+        cloud_df['flag'] = 'night'
+        cloud_df.loc[day, 'flag'] = 'clear'
+        cloud_df.loc[ice_clouds, 'flag'] = 'ice_cloud'
+        cloud_df.loc[water_clouds, 'flag'] = 'water_cloud'
+        cloud_df.loc[day_missing_ctype, 'flag'] = 'bad_cloud'
+        cloud_df.loc[day_missing_opd, 'flag'] = 'bad_cloud'
+        cloud_df.loc[day_missing_reff, 'flag'] = 'bad_cloud'
 
     mask = True
     if filter_daylight:
@@ -135,8 +135,8 @@ def clean_cloud_df(
 
     if filter_clear or filter_daylight:
         logger.info(
-            "Data reduced from "
-            "{} rows to {} after filters ({:.2f}% of original)".format(
+            'Data reduced from '
+            '{} rows to {} after filters ({:.2f}% of original)'.format(
                 len(cloud_df), mask.sum(), 100 * mask.sum() / len(cloud_df)
             )
         )
@@ -145,10 +145,10 @@ def clean_cloud_df(
 
     if add_cloud_flag:
         logger.debug(
-            "Feature flag column has these values: {}".format(
+            'Feature flag column has these values: {}'.format(
                 cloud_df.flag.unique()
             )
         )
-    logger.info("Cleaning took {:.1f} seconds".format(time.time() - t0))
+    logger.info('Cleaning took {:.1f} seconds'.format(time.time() - t0))
 
     return cloud_df
diff --git a/mlclouds/data_handlers.py b/mlclouds/data_handlers.py
@@ -1,4 +1,7 @@
-"""Data handlers for training and validation data."""
+"""Data handlers for training and validation data.
+
+TODO: Move sky class filter to cloud data cleaning?
+"""
 
 import copy
 import logging
@@ -22,26 +25,13 @@
     TRAINING_PREP_KWARGS,
     calc_time_step,
     extract_file_meta,
+    get_valid_surf_sites,
     surf_meta,
 )
 
 logger = logging.getLogger(__name__)
 
 
-def get_valid_surf_sites(sites, fp_surfrad_data, data_file):
-    """Get surfrad sites available for the given data file. This is
-    determined from the year in the data file name."""
-    year, _ = extract_file_meta(data_file)
-    valid_sites = []
-    for gid in sites:
-        surfrad_file = fp_surfrad_data.format(
-            year=year, code=surf_meta().loc[gid, 'surfrad_id']
-        )
-        if os.path.exists(surfrad_file):
-            valid_sites.append(gid)
-    return valid_sites
-
-
 class TrainData:
     """Load and prep training data"""
 

diff --git a/mlclouds/model/base.py b/mlclouds/model/base.py
@@ -10,29 +10,37 @@
 class MLCloudsModel(PhygnnModel):
     """Extended phygnn model with methods for interfacing with NSRDB."""
 
+    CTYPE_FRACTIONS = ('clear_fraction', 'ice_fraction', 'water_fraction')
+
+    @property
+    def predicts_cloud_fractions(self):
+        """Check if this model predicts cloud type fractions."""
+        return all(f in self.label_names for f in self.CTYPE_FRACTIONS)
+
     def predict(self, *args, **kwargs):
-        """Override predict method to return cloud type if cloud type fractions
-        are predicted by this model."""
+        """Convert cloud type fractions into an integer cloud type and remove
+        cloud type fractions from output, if cloud type fractions are predicted
+        by this model. Otherwise, return output without any additional
+        processing."""
         out = super().predict(*args, **kwargs)
-        frac_names = ("clear_fraction", "ice_fraction", "water_fraction")
-        is_array = not hasattr(out, "columns")
-        if all(f in self.label_names for f in frac_names):
+        is_array = not hasattr(out, 'columns')
+        if self.predicts_cloud_fractions:
             if is_array:
                 out = pd.DataFrame(columns=self.label_names, data=out)
-            fracs = {f: out[f].values for f in frac_names}
-            out["cloud_type"] = _get_sky_type(**fracs)
-            out_feats = [f for f in self.label_names if f not in frac_names]
-            out_feats += ["cloud_type"]
-            out = out[out_feats]
+            fracs = {f: out[f].values for f in self.CTYPE_FRACTIONS}
+            out['cloud_type'] = _get_sky_type(**fracs)
+            out = out[self.output_names]
         return out if not is_array else np.asarray(out)
 
     @property
     def output_names(self):
-        """Output feature names with parsing of cloud type fractions if the
-        model predicts cloud types."""
-        frac_names = ("clear_fraction", "ice_fraction", "water_fraction")
-        output_names = self.label_names
-        if all(f in output_names for f in frac_names):
-            output_names = [f for f in output_names if f not in frac_names]
-            output_names += ["cloud_type"]
+        """Remove cloud type fraction labels from features and replace with
+        "cloud_type", if this model predicts cloud type fractions. Otherwise,
+        just return labels unchanged."""
+        output_names = self.label_names.copy()
+        if self.predicts_cloud_fractions:
+            output_names = [
+                f for f in output_names if f not in self.CTYPE_FRACTIONS
+            ]
+            output_names += ['cloud_type']
         return output_names
diff --git a/mlclouds/model/experimental/config.json b/mlclouds/model/experimental/config.json
@@ -3,21 +3,16 @@
     "filter_clear": false,
     "nan_option": "interp"
   },
-  "epochs_a": 50,
-  "epochs_b": 50,
+  "epochs_a": 100,
+  "epochs_b": 90,
   "features": [
     "solar_zenith_angle",
-    "cloud_type",
     "refl_0_65um_nom",
-    "refl_0_65um_nom_stddev_3x3",
-    "refl_3_75um_nom",
     "temp_3_75um_nom",
     "temp_11_0um_nom",
-    "temp_11_0um_nom_stddev_3x3",
-    "cloud_probability",
-    "cloud_fraction",
     "air_temperature",
     "dew_point",
+    "cloud_type",
     "relative_humidity",
     "total_precipitable_water",
     "surface_albedo"
@@ -55,12 +50,11 @@
     0
   ],
   "loss_weights_b": [
-    1,
-    2
+    0.5,
+    0.5
   ],
   "metric": "relative_mae",
   "n_batch": 64,
-  "one_hot_categories": null,
   "p_fun": "p_fun_all_sky",
   "p_kwargs": {
     "loss_terms": [
@@ -71,6 +65,16 @@
   "surfrad_window_minutes": 15,
   "y_labels": [
     "cld_opd_dcomp",
-    "cld_reff_dcomp"
-  ]
+    "cld_reff_dcomp",
+    "ice_fraction",
+    "clear_fraction",
+    "water_fraction"
+  ],
+  "training_prep_kwargs": {
+    "filter_daylight": true,
+    "filter_clear": false,
+    "add_cloud_flag": true,
+    "sza_lim": 89,
+    "nan_option": "interp"
+  }
 }