Skip to content

Commit

Permalink
some doc string additions, and cleaning.
Browse files Browse the repository at this point in the history
  • Loading branch information
bnb32 committed Oct 5, 2024
1 parent 2638213 commit 704ad07
Show file tree
Hide file tree
Showing 15 changed files with 158 additions and 272 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
mlclouds/_version.py

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down
8 changes: 2 additions & 6 deletions mlclouds/autoxval.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
"""
Automatic cross validation of PHYGNN models predicting opd and reff
Mike Bannister 7/2020
Based on code by Grant Buster
"""
"""Automatic cross validation of PHYGNN models predicting cloud types and
properties"""

import json
import logging
Expand Down
122 changes: 61 additions & 61 deletions mlclouds/data_cleaners.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
"""
Automatic cross validation of PhyGNN models predicting opd and reff
Mike Bannister 7/2020
Based on code by Grant Buster
"""
"""Training data cleaning methods"""

import logging
import time
Expand All @@ -17,14 +12,17 @@

def sky_class_filter(cloud_df):
"""Filter cloud data so that final data includes only data with matching
sky_class and cloud types."""
sky_class and cloud types. ``sky_class`` is determined by comparing clear
sky REST2 predictions with ground measurements. If the clear sky prediction
is within a given tolerance of the ground measurement, the class is
considered "clear"; otherwise the class is considered "cloudy"."""

cloudy = cloud_df["cloud_type"].isin(ICE_TYPES + WATER_TYPES)
clear = cloud_df["cloud_type"].isin(CLEAR_TYPES)
cloudy = cloud_df['cloud_type'].isin(ICE_TYPES + WATER_TYPES)
clear = cloud_df['cloud_type'].isin(CLEAR_TYPES)

if "sky_class" in cloud_df.columns:
cloudy &= cloud_df["sky_class"].isin(("cloudy",))
clear &= cloud_df["sky_class"].isin(("clear",))
if 'sky_class' in cloud_df.columns:
cloudy &= cloud_df['sky_class'].isin(('cloudy',))
clear &= cloud_df['sky_class'].isin(('clear',))

return clear | cloudy

Expand All @@ -35,96 +33,98 @@ def clean_cloud_df(
filter_clear=False,
add_cloud_flag=True,
sza_lim=89,
nan_option="interp",
nan_option='interp',
):
"""Clean up cloud data"""
"""Clean up cloud data. Includes options to filter night and clear
timesteps, timesteps whose solar zenith angle exceeds ``sza_lim``, and to
add cloud flag labels."""
t0 = time.time()
cloud_df = cloud_df_raw.copy()
day = cloud_df["solar_zenith_angle"] < sza_lim
day = cloud_df['solar_zenith_angle'] < sza_lim

day_missing_ctype = day & (cloud_df["cloud_type"] < 0)
cloud_df.loc[(cloud_df["cloud_type"] < 0), "cloud_type"] = np.nan
cloud_df["cloud_type"] = (
cloud_df["cloud_type"].interpolate("nearest").ffill().bfill()
day_missing_ctype = day & (cloud_df['cloud_type'] < 0)
cloud_df.loc[(cloud_df['cloud_type'] < 0), 'cloud_type'] = np.nan
cloud_df['cloud_type'] = (
cloud_df['cloud_type'].interpolate('nearest').ffill().bfill()
)

cloudy = cloud_df["cloud_type"].isin(ICE_TYPES + WATER_TYPES)
cloudy = cloud_df['cloud_type'].isin(ICE_TYPES + WATER_TYPES)
day_clouds = day & cloudy
day_missing_opd = day_clouds & (cloud_df["cld_opd_dcomp"] <= 0)
day_missing_reff = day_clouds & (cloud_df["cld_reff_dcomp"] <= 0)
cloud_df.loc[(cloud_df["cld_opd_dcomp"] <= 0), "cld_opd_dcomp"] = np.nan
cloud_df.loc[(cloud_df["cld_reff_dcomp"] <= 0), "cld_reff_dcomp"] = np.nan
day_missing_opd = day_clouds & (cloud_df['cld_opd_dcomp'] <= 0)
day_missing_reff = day_clouds & (cloud_df['cld_reff_dcomp'] <= 0)
cloud_df.loc[(cloud_df['cld_opd_dcomp'] <= 0), 'cld_opd_dcomp'] = np.nan
cloud_df.loc[(cloud_df['cld_reff_dcomp'] <= 0), 'cld_reff_dcomp'] = np.nan

logger.info(
"{:.2f}% of timesteps are daylight".format(100 * day.sum() / len(day))
'{:.2f}% of timesteps are daylight'.format(100 * day.sum() / len(day))
)
logger.info(
"{:.2f}% of daylight timesteps are cloudy".format(
'{:.2f}% of daylight timesteps are cloudy'.format(
100 * day_clouds.sum() / day.sum()
)
)
logger.info(
"{:.2f}% of daylight timesteps are missing cloud type".format(
'{:.2f}% of daylight timesteps are missing cloud type'.format(
100 * day_missing_ctype.sum() / day.sum()
)
)
logger.info(
"{:.2f}% of cloudy daylight timesteps are missing cloud opd".format(
'{:.2f}% of cloudy daylight timesteps are missing cloud opd'.format(
100 * day_missing_opd.sum() / day_clouds.sum()
)
)
logger.info(
"{:.2f}% of cloudy daylight timesteps are missing cloud reff".format(
'{:.2f}% of cloudy daylight timesteps are missing cloud reff'.format(
100 * day_missing_reff.sum() / day_clouds.sum()
)
)

logger.debug("Column NaN values:")
logger.debug('Column NaN values:')
for c in cloud_df.columns:
pnan = 100 * pd.isna(cloud_df[c]).sum() / len(cloud_df)
logger.debug('\t"{}" has {:.2f}% NaN values'.format(c, pnan))

if "interp" in nan_option.lower():
logger.debug("Interpolating opd and reff")
if 'interp' in nan_option.lower():
logger.debug('Interpolating opd and reff')

if "time_index" in cloud_df.columns:
if 'time_index' in cloud_df.columns:
time_index = cloud_df.time_index
assert time_index.isnull().sum() == 0
cloud_df = cloud_df.drop("time_index", axis=1)
cloud_df = cloud_df.interpolate("nearest").ffill().bfill()
cloud_df["time_index"] = time_index
cloud_df = cloud_df.drop('time_index', axis=1)
cloud_df = cloud_df.interpolate('nearest').ffill().bfill()
cloud_df['time_index'] = time_index
else:
cloud_df = cloud_df.interpolate("nearest").ffill().bfill()
cloud_df = cloud_df.interpolate('nearest').ffill().bfill()

cloud_df.loc[~cloudy, "cld_opd_dcomp"] = 0.0
cloud_df.loc[~cloudy, "cld_reff_dcomp"] = 0.0
elif "drop" in nan_option.lower():
cloud_df.loc[~cloudy, 'cld_opd_dcomp'] = 0.0
cloud_df.loc[~cloudy, 'cld_reff_dcomp'] = 0.0
elif 'drop' in nan_option.lower():
l0 = len(cloud_df)
cloud_df = cloud_df.dropna(axis=0, how="any")
day = cloud_df["solar_zenith_angle"] < sza_lim
cloudy = cloud_df["cloud_type"].isin(ICE_TYPES + WATER_TYPES)
cloud_df = cloud_df.dropna(axis=0, how='any')
day = cloud_df['solar_zenith_angle'] < sza_lim
cloudy = cloud_df['cloud_type'].isin(ICE_TYPES + WATER_TYPES)
logger.debug(
"Dropped {} rows with NaN values.".format(l0 - len(cloud_df))
'Dropped {} rows with NaN values.'.format(l0 - len(cloud_df))
)

assert ~any(cloud_df["cloud_type"] < 0)
assert ~any(cloud_df['cloud_type'] < 0)
assert ~any(pd.isna(cloud_df))
assert ~any(cloudy & (cloud_df["cld_opd_dcomp"] <= 0))
assert ~any(cloudy & (cloud_df['cld_opd_dcomp'] <= 0))

if add_cloud_flag:
logger.debug(
"Adding cloud type flag (e.g. flag=[night, clear, "
"ice_cloud, water_cloud, bad_cloud])"
'Adding cloud type flag (e.g. flag=[night, clear, '
'ice_cloud, water_cloud, bad_cloud])'
)
ice_clouds = cloud_df["cloud_type"].isin(ICE_TYPES)
water_clouds = cloud_df["cloud_type"].isin(WATER_TYPES)
cloud_df["flag"] = "night"
cloud_df.loc[day, "flag"] = "clear"
cloud_df.loc[ice_clouds, "flag"] = "ice_cloud"
cloud_df.loc[water_clouds, "flag"] = "water_cloud"
cloud_df.loc[day_missing_ctype, "flag"] = "bad_cloud"
cloud_df.loc[day_missing_opd, "flag"] = "bad_cloud"
cloud_df.loc[day_missing_reff, "flag"] = "bad_cloud"
ice_clouds = cloud_df['cloud_type'].isin(ICE_TYPES)
water_clouds = cloud_df['cloud_type'].isin(WATER_TYPES)
cloud_df['flag'] = 'night'
cloud_df.loc[day, 'flag'] = 'clear'
cloud_df.loc[ice_clouds, 'flag'] = 'ice_cloud'
cloud_df.loc[water_clouds, 'flag'] = 'water_cloud'
cloud_df.loc[day_missing_ctype, 'flag'] = 'bad_cloud'
cloud_df.loc[day_missing_opd, 'flag'] = 'bad_cloud'
cloud_df.loc[day_missing_reff, 'flag'] = 'bad_cloud'

mask = True
if filter_daylight:
Expand All @@ -135,8 +135,8 @@ def clean_cloud_df(

if filter_clear or filter_daylight:
logger.info(
"Data reduced from "
"{} rows to {} after filters ({:.2f}% of original)".format(
'Data reduced from '
'{} rows to {} after filters ({:.2f}% of original)'.format(
len(cloud_df), mask.sum(), 100 * mask.sum() / len(cloud_df)
)
)
Expand All @@ -145,10 +145,10 @@ def clean_cloud_df(

if add_cloud_flag:
logger.debug(
"Feature flag column has these values: {}".format(
'Feature flag column has these values: {}'.format(
cloud_df.flag.unique()
)
)
logger.info("Cleaning took {:.1f} seconds".format(time.time() - t0))
logger.info('Cleaning took {:.1f} seconds'.format(time.time() - t0))

return cloud_df
20 changes: 5 additions & 15 deletions mlclouds/data_handlers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
"""Data handlers for training and validation data."""
"""Data handlers for training and validation data.
TODO: Move sky class filter to cloud data cleaning?
"""

import copy
import logging
Expand All @@ -22,26 +25,13 @@
TRAINING_PREP_KWARGS,
calc_time_step,
extract_file_meta,
get_valid_surf_sites,
surf_meta,
)

logger = logging.getLogger(__name__)


def get_valid_surf_sites(sites, fp_surfrad_data, data_file):
    """Return the subset of ``sites`` that have a surfrad file on disk.

    The year is taken from the metadata extracted from ``data_file``. For
    each site gid, the site's ``surfrad_id`` is looked up in the surface
    metadata table and substituted, together with the year, into the
    ``fp_surfrad_data`` template (``year`` and ``code`` fields). Sites whose
    resulting file path does not exist are left out.
    """
    year, _unused = extract_file_meta(data_file)

    def _surfrad_path(gid):
        # Build the expected surfrad file path for a single site.
        return fp_surfrad_data.format(
            year=year, code=surf_meta().loc[gid, 'surfrad_id']
        )

    return [gid for gid in sites if os.path.exists(_surfrad_path(gid))]


class TrainData:
"""Load and prep training data"""

Expand Down
42 changes: 25 additions & 17 deletions mlclouds/model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,37 @@
class MLCloudsModel(PhygnnModel):
"""Extended phygnn model with methods for interfacing with NSRDB."""

CTYPE_FRACTIONS = ('clear_fraction', 'ice_fraction', 'water_fraction')

@property
def predicts_cloud_fractions(self):
"""Check if this model predicts cloud type fractions."""
return all(f in self.label_names for f in self.CTYPE_FRACTIONS)

def predict(self, *args, **kwargs):
"""Override predict method to return cloud type if cloud type fractions
are predicted by this model."""
"""Convert cloud type fractions into an integer cloud type and remove
cloud type fractions from output, if cloud type fractions are predicted
by this model. Otherwise, return output without any additional
processing."""
out = super().predict(*args, **kwargs)
frac_names = ("clear_fraction", "ice_fraction", "water_fraction")
is_array = not hasattr(out, "columns")
if all(f in self.label_names for f in frac_names):
is_array = not hasattr(out, 'columns')
if self.predicts_cloud_fractions:
if is_array:
out = pd.DataFrame(columns=self.label_names, data=out)
fracs = {f: out[f].values for f in frac_names}
out["cloud_type"] = _get_sky_type(**fracs)
out_feats = [f for f in self.label_names if f not in frac_names]
out_feats += ["cloud_type"]
out = out[out_feats]
fracs = {f: out[f].values for f in self.CTYPE_FRACTIONS}
out['cloud_type'] = _get_sky_type(**fracs)
out = out[self.output_names]
return out if not is_array else np.asarray(out)

@property
def output_names(self):
"""Output feature names with parsing of cloud type fractions if the
model predicts cloud types."""
frac_names = ("clear_fraction", "ice_fraction", "water_fraction")
output_names = self.label_names
if all(f in output_names for f in frac_names):
output_names = [f for f in output_names if f not in frac_names]
output_names += ["cloud_type"]
"""Remove cloud type fraction labels from features and replace with
"cloud_type", if this model predicts cloud type fractions. Otherwise,
just return labels unchanged."""
output_names = self.label_names.copy()
if self.predicts_cloud_fractions:
output_names = [
f for f in output_names if f not in self.CTYPE_FRACTIONS
]
output_names += ['cloud_type']
return output_names
30 changes: 17 additions & 13 deletions mlclouds/model/experimental/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,16 @@
"filter_clear": false,
"nan_option": "interp"
},
"epochs_a": 50,
"epochs_b": 50,
"epochs_a": 100,
"epochs_b": 90,
"features": [
"solar_zenith_angle",
"cloud_type",
"refl_0_65um_nom",
"refl_0_65um_nom_stddev_3x3",
"refl_3_75um_nom",
"temp_3_75um_nom",
"temp_11_0um_nom",
"temp_11_0um_nom_stddev_3x3",
"cloud_probability",
"cloud_fraction",
"air_temperature",
"dew_point",
"cloud_type",
"relative_humidity",
"total_precipitable_water",
"surface_albedo"
Expand Down Expand Up @@ -55,12 +50,11 @@
0
],
"loss_weights_b": [
1,
2
0.5,
0.5
],
"metric": "relative_mae",
"n_batch": 64,
"one_hot_categories": null,
"p_fun": "p_fun_all_sky",
"p_kwargs": {
"loss_terms": [
Expand All @@ -71,6 +65,16 @@
"surfrad_window_minutes": 15,
"y_labels": [
"cld_opd_dcomp",
"cld_reff_dcomp"
]
"cld_reff_dcomp",
"ice_fraction",
"clear_fraction",
"water_fraction"
],
"training_prep_kwargs": {
"filter_daylight": true,
"filter_clear": false,
"add_cloud_flag": true,
"sza_lim": 89,
"nan_option": "interp"
}
}
Loading

0 comments on commit 704ad07

Please sign in to comment.