[missing values] Removing replacing missing values (#4905)
(cherry picked from commit 61add60)
john-bodley authored and xtinec committed Mar 21, 2019
1 parent 7bc3006 commit 4ffc1b3
Showing 6 changed files with 46 additions and 106 deletions.
3 changes: 2 additions & 1 deletion superset/assets/backendSync.json
@@ -3783,4 +3783,5 @@
"default": false
}
}
}
}

2 changes: 1 addition & 1 deletion superset/assets/src/explore/controls.jsx
@@ -1350,7 +1350,7 @@ export const controls = {
'mean',
'min',
'max',
'stdev',
'std',
'var',
]),
default: 'sum',
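Why 'std' and not 'stdev': pandas resolves string aggregation names to Series methods, and Series has a .std() method but no .stdev(), so the old option could never aggregate. A minimal sketch (the exact error message varies across pandas versions):

    import pandas as pd

    s = pd.Series([1.0, 2.0, 4.0])
    s.agg('std')    # works: sample standard deviation
    s.agg('stdev')  # raises; Series has no method named 'stdev'
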
23 changes: 0 additions & 23 deletions superset/common/query_context.py
@@ -41,7 +41,6 @@ class QueryContext:
to retrieve the data payload for a given viz.
"""

default_fillna = 0
cache_type = 'df'
enforce_numerical_metrics = True

@@ -103,7 +102,6 @@ def get_query_result(self, query_object):
self.df_metrics_to_num(df, query_object)

df.replace([np.inf, -np.inf], np.nan)
df = self.handle_nulls(df)
return {
'query': result.query,
'status': result.status,
@@ -118,27 +116,6 @@ def df_metrics_to_num(self, df, query_object):
if dtype.type == np.object_ and col in metrics:
df[col] = pd.to_numeric(df[col], errors='coerce')

def handle_nulls(self, df):
fillna = self.get_fillna_for_columns(df.columns)
return df.fillna(fillna)

def get_fillna_for_col(self, col):
"""Returns the value to use as filler for a specific Column.type"""
if col and col.is_string:
return ' NULL'
return self.default_fillna

def get_fillna_for_columns(self, columns=None):
"""Returns a dict or scalar that can be passed to DataFrame.fillna"""
if columns is None:
return self.default_fillna
columns_dict = {col.column_name: col for col in self.datasource.columns}
fillna = {
c: self.get_fillna_for_col(columns_dict.get(c))
for c in columns
}
return fillna

def get_data(self, df):
return df.to_dict(orient='records')

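The deleted fillna machinery above is the core of this change: it replaced numeric NaNs with 0 and string NaNs with the literal ' NULL', so downstream aggregations treated fabricated values as real data. A minimal sketch of the distortion, using the 'name'/'sum__num' columns from the removed test further down:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'name': ['a', np.nan], 'sum__num': [10.0, np.nan]})
    df['sum__num'].mean()                                 # 10.0 — NaN is skipped
    filled = df.fillna({'name': ' NULL', 'sum__num': 0})  # old behaviour
    filled['sum__num'].mean()                             # 5.0 — the fake 0 counts
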
102 changes: 43 additions & 59 deletions superset/viz.py
@@ -75,7 +75,6 @@ class BaseViz(object):
verbose_name = 'Base Viz'
credits = ''
is_timeseries = False
default_fillna = 0
cache_type = 'df'
enforce_numerical_metrics = True

@@ -164,28 +163,6 @@ def run_extra_queries(self):
"""
pass

def handle_nulls(self, df):
fillna = self.get_fillna_for_columns(df.columns)
return df.fillna(fillna)

def get_fillna_for_col(self, col):
"""Returns the value to use as filler for a specific Column.type"""
if col:
if col.is_string:
return ' NULL'
return self.default_fillna

def get_fillna_for_columns(self, columns=None):
"""Returns a dict or scalar that can be passed to DataFrame.fillna"""
if columns is None:
return self.default_fillna
columns_dict = {col.column_name: col for col in self.datasource.columns}
fillna = {
c: self.get_fillna_for_col(columns_dict.get(c))
for c in columns
}
return fillna

def get_samples(self):
query_obj = self.query_obj()
query_obj.update({
@@ -254,8 +231,7 @@ def get_df(self, query_obj=None):
if self.enforce_numerical_metrics:
self.df_metrics_to_num(df)

df.replace([np.inf, -np.inf], np.nan)
df = self.handle_nulls(df)
df.replace([np.inf, -np.inf], np.nan, inplace=True)
return df

def df_metrics_to_num(self, df):
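A subtle fix rides along in get_df: DataFrame.replace returns a new frame unless inplace=True is passed, so the old bare call discarded its result and ±inf values survived into the payload. A minimal sketch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'x': [1.0, np.inf]})
    df.replace([np.inf, -np.inf], np.nan)                # no-op: result discarded
    df.replace([np.inf, -np.inf], np.nan, inplace=True)  # actually mutates df
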
@@ -653,7 +629,9 @@ def get_data(self, df):
pt = df.pivot_table(
index=DTTM_ALIAS,
columns=columns,
values=values)
values=values,
dropna=False,
)
pt.index = pt.index.map(str)
pt = pt.sort_index()
return dict(
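dropna=False is the recurring fix in this file: pivot_table drops any group whose aggregated values are all NaN, which made whole rows or series silently vanish from charts. A minimal sketch:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'k': ['a', 'b'], 'v': [1.0, np.nan]})
    df.pivot_table(index='k', values='v')                # group 'b' is dropped
    df.pivot_table(index='k', values='v', dropna=False)  # group 'b' kept as NaN
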
Expand Down Expand Up @@ -696,20 +674,28 @@ def get_data(self, df):
self.form_data.get('granularity') == 'all' and
DTTM_ALIAS in df):
del df[DTTM_ALIAS]

aggfunc = self.form_data.get('pandas_aggfunc')

# Ensure that Pandas's sum function mimics that of SQL.
if aggfunc == 'sum':
aggfunc = lambda x: x.sum(min_count=1) # noqa: E731

df = df.pivot_table(
index=self.form_data.get('groupby'),
columns=self.form_data.get('columns'),
values=[utils.get_metric_name(m) for m in self.form_data.get('metrics')],
aggfunc=self.form_data.get('pandas_aggfunc'),
aggfunc=aggfunc,
margins=self.form_data.get('pivot_margins'),
dropna=False,
)
# Display metrics side by side with each column
if self.form_data.get('combine_metric'):
df = df.stack(0).unstack()
return dict(
columns=list(df.columns),
html=df.to_html(
na_rep='',
na_rep='null',
classes=(
'dataframe table table-striped table-bordered '
'table-condensed table-hover').split(' ')),
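The lambda exists because SQL's SUM over nothing but NULLs yields NULL, while pandas' sum() of an all-NaN series yields 0.0; min_count=1 restores the SQL semantics, and na_rep='null' then renders those genuine gaps explicitly in the HTML table instead of as empty cells. A minimal sketch:

    import numpy as np
    import pandas as pd

    s = pd.Series([np.nan, np.nan])
    s.sum()             # 0.0 — pandas' default
    s.sum(min_count=1)  # nan — matches SQL SUM over all-NULL rows
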
@@ -877,7 +863,7 @@ def to_series(self, df, classed='', title_suffix=''):
index_value = label_sep.join(index_value)
boxes = defaultdict(dict)
for (label, key), value in row.items():
if key == 'median':
if key == 'nanmedian':
key = 'Q2'
boxes[label][key] = value
for label, box in boxes.items():
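The key changes from 'median' to 'nanmedian' because pandas labels columns produced by .agg with each aggregating function's __name__, and the aggregate list further down now uses np.nanmedian in place of np.median. A minimal check:

    import numpy as np

    np.median.__name__     # 'median'
    np.nanmedian.__name__  # 'nanmedian'
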
@@ -894,28 +880,24 @@ def to_series(self, df, classed='', title_suffix=''):

def get_data(self, df):
form_data = self.form_data
df = df.fillna(0)

# conform to NVD3 names
def Q1(series): # need to be named functions - can't use lambdas
return np.percentile(series, 25)
return np.nanpercentile(series, 25)

def Q3(series):
return np.percentile(series, 75)
return np.nanpercentile(series, 75)

whisker_type = form_data.get('whisker_options')
if whisker_type == 'Tukey':

def whisker_high(series):
upper_outer_lim = Q3(series) + 1.5 * (Q3(series) - Q1(series))
series = series[series <= upper_outer_lim]
return series[np.abs(series - upper_outer_lim).argmin()]
return series[series <= upper_outer_lim].max()

def whisker_low(series):
lower_outer_lim = Q1(series) - 1.5 * (Q3(series) - Q1(series))
# find the closest value above the lower outer limit
series = series[series >= lower_outer_lim]
return series[np.abs(series - lower_outer_lim).argmin()]
return series[series >= lower_outer_lim].min()

elif whisker_type == 'Min/max (no outliers)':

@@ -929,10 +911,10 @@ def whisker_low(series):
low, high = whisker_type.replace(' percentiles', '').split('/')

def whisker_high(series):
return np.percentile(series, int(high))
return np.nanpercentile(series, int(high))

def whisker_low(series):
return np.percentile(series, int(low))
return np.nanpercentile(series, int(low))

else:
raise ValueError('Unknown whisker type: {}'.format(whisker_type))
@@ -943,7 +925,7 @@ def outliers(series):
# pandas sometimes doesn't like getting lists back here
return set(above.tolist() + below.tolist())

aggregate = [Q1, np.median, Q3, whisker_high, whisker_low, outliers]
aggregate = [Q1, np.nanmedian, Q3, whisker_high, whisker_low, outliers]
df = df.groupby(form_data.get('groupby')).agg(aggregate)
chart_data = self.to_series(df)
return chart_data
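The box-plot aggregators move to the nan-aware numpy functions because a single NaN poisons np.percentile and np.median; with the blanket df.fillna(0) gone, the data can now legitimately contain NaN. The rewritten Tukey whiskers also return the extreme observation inside the fences — the textbook whisker — rather than the value nearest the fence. A minimal sketch:

    import numpy as np

    values = np.array([1.0, np.nan, 3.0])
    np.percentile(values, 50)     # nan — the missing value poisons the result
    np.nanpercentile(values, 50)  # 2.0 — missing values are ignored
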
@@ -1034,7 +1016,6 @@ def as_floats(field):
return d

def get_data(self, df):
df = df.fillna(0)
df['metric'] = df[[utils.get_metric_name(self.metric)]]
values = df['metric'].values
return {
@@ -1152,22 +1133,25 @@ def to_series(self, df, classed='', title_suffix=''):

def process_data(self, df, aggregate=False):
fd = self.form_data
df = df.fillna(0)
if fd.get('granularity') == 'all':
raise Exception(_('Pick a time granularity for your time series'))

if not aggregate:
df = df.pivot_table(
index=DTTM_ALIAS,
columns=fd.get('groupby'),
values=self.metric_labels)
values=self.metric_labels,
dropna=False,
)
else:
df = df.pivot_table(
index=DTTM_ALIAS,
columns=fd.get('groupby'),
values=self.metric_labels,
fill_value=0,
aggfunc=sum)
aggfunc=sum,
dropna=False,
)

fm = fd.get('resample_fillmethod')
if not fm:
@@ -1176,8 +1160,6 @@ def process_data(self, df, aggregate=False):
rule = fd.get('resample_rule')
if how and rule:
df = df.resample(rule, how=how, fill_method=fm)
if not fm:
df = df.fillna(0)

if self.sort_series:
dfs = df.sum()
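Dropping the fillna(0) fallback lets resampling gaps stay gaps: unless the user chooses an explicit resample fill method, empty time buckets now reach the chart as nulls rather than fabricated zeros. A minimal sketch using the current resample API (the surrounding code still uses the older how=/fill_method= keywords):

    import numpy as np
    import pandas as pd

    idx = pd.date_range('2019-01-01', periods=3, freq='D')
    s = pd.Series([1.0, np.nan, 3.0], index=idx)
    s.resample('D').mean()            # the gap stays NaN
    s.resample('D').mean().fillna(0)  # old behaviour: the gap becomes a fake 0
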
@@ -1241,7 +1223,6 @@ def get_data(self, df):
fd = self.form_data
comparison_type = fd.get('comparison_type') or 'values'
df = self.process_data(df)

if comparison_type == 'values':
chart_data = self.to_series(df)
for i, (label, df2) in enumerate(self._extra_chart_data):
@@ -1368,7 +1349,6 @@ def to_series(self, df, classed=''):

def get_data(self, df):
fd = self.form_data
df = df.fillna(0)

if self.form_data.get('granularity') == 'all':
raise Exception(_('Pick a time granularity for your time series'))
@@ -1377,7 +1357,9 @@ def get_data(self, df):
metric_2 = utils.get_metric_name(fd.get('metric_2'))
df = df.pivot_table(
index=DTTM_ALIAS,
values=[metric, metric_2])
values=[metric, metric_2],
dropna=False,
)

chart_data = self.to_series(df)
return chart_data
@@ -1425,7 +1407,9 @@ def get_data(self, df):
df = df.pivot_table(
index=DTTM_ALIAS,
columns='series',
values=utils.get_metric_name(fd.get('metric')))
values=utils.get_metric_name(fd.get('metric')),
dropna=False,
)
chart_data = self.to_series(df)
for serie in chart_data:
serie['rank'] = rank_lookup[serie['key']]
@@ -1462,7 +1446,9 @@ def get_data(self, df):
metric = self.metric_labels[0]
df = df.pivot_table(
index=self.groupby,
values=[metric])
values=[metric],
dropna=False,
)
df.sort_values(by=metric, ascending=False, inplace=True)
df = df.reset_index()
df.columns = ['x', 'y']
@@ -1549,9 +1535,10 @@ def get_data(self, df):
pt = df.pivot_table(
index=self.groupby,
columns=columns,
values=metrics)
values=metrics,
dropna=False,
)
if fd.get('contribution'):
pt = pt.fillna(0)
pt = pt.T
pt = (pt / pt.sum()).T
pt = pt.reindex(row.index)
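With the fillna(0) removed, the contribution normalization still works because DataFrame.sum skips NaN by default: shares are computed over the values actually present, and missing cells stay missing instead of showing up as 0% contributions. A minimal sketch:

    import numpy as np
    import pandas as pd

    pt = pd.DataFrame({'m': [3.0, np.nan, 1.0]})
    pt / pt.sum()  # 0.75, NaN, 0.25 — shares of the observed total
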
@@ -2117,9 +2104,6 @@ class BaseDeckGLViz(BaseViz):
credits = '<a href="https://uber.github.io/deck.gl/">deck.gl</a>'
spatial_control_keys = []

def handle_nulls(self, df):
return df

def get_metrics(self):
self.metric = self.form_data.get('size')
return [self.metric] if self.metric else []
@@ -2572,11 +2556,11 @@ def get_data(self, df):
fd = self.form_data
groups = fd.get('groupby')
metrics = fd.get('metrics')
df.fillna(0)
df = df.pivot_table(
index=DTTM_ALIAS,
columns=groups,
values=metrics)
values=metrics,
)
cols = []
# Be rid of falsey keys
for col in df.columns:
@@ -2699,7 +2683,7 @@ def levels_for_time(self, groups, df):
for i in range(0, len(groups) + 1):
self.form_data['groupby'] = groups[:i]
df_drop = df.drop(groups[i:], 1)
procs[i] = self.process_data(df_drop, aggregate=True).fillna(0)
procs[i] = self.process_data(df_drop, aggregate=True)
self.form_data['groupby'] = groups
return procs

10 changes: 0 additions & 10 deletions tests/core_tests.py
@@ -622,16 +622,6 @@ def test_slice_query_endpoint(self):
assert 'language' in resp
self.logout()

def test_viz_get_fillna_for_columns(self):
slc = self.get_slice('Girls', db.session)
q = slc.viz.query_obj()
results = slc.viz.datasource.query(q)
fillna_columns = slc.viz.get_fillna_for_columns(results.df.columns)
self.assertDictEqual(
fillna_columns,
{'name': ' NULL', 'sum__num': 0},
)

def test_import_csv(self):
self.login(username='admin')
filename = 'testCSV.csv'
12 changes: 0 additions & 12 deletions tests/viz_tests.py
@@ -84,18 +84,6 @@ def test_process_metrics(self):
self.assertEqual(test_viz.metric_labels, expect_metric_labels)
self.assertEqual(test_viz.all_metrics, expect_metric_labels)

def test_get_fillna_returns_default_on_null_columns(self):
form_data = {
'viz_type': 'table',
'token': '12345',
}
datasource = self.get_datasource_mock()
test_viz = viz.BaseViz(datasource, form_data)
self.assertEqual(
test_viz.default_fillna,
test_viz.get_fillna_for_columns(),
)

def test_get_df_returns_empty_df(self):
form_data = {'dummy': 123}
query_obj = {'granularity': 'day'}
