Skip to content

Commit

Permalink
[load_examples] download data at runtime
Browse files Browse the repository at this point in the history
When running `superset load_examples` to load example data sets,
Superset used to load from the local package. This created a few issues
notably around licensing (what are these datasets licensed as?) and
around package size.

For now, I moved the data sets here:
https://github.com/apache-superset/examples-data

Altered the logic to download the data from where it is stored.
  • Loading branch information
mistercrunch committed Apr 17, 2019
1 parent 154f6ab commit 1ac4964
Show file tree
Hide file tree
Showing 26 changed files with 64 additions and 175 deletions.
Binary file removed superset/data/airports.csv.gz
Binary file not shown.
Binary file removed superset/data/bart-lines.json.gz
Binary file not shown.
13 changes: 7 additions & 6 deletions superset/data/bart_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,17 @@

from superset import db
from superset.utils.core import get_or_create_main_db
from .helpers import DATA_FOLDER, TBL
from .helpers import DATA_FOLDER, TBL, get_example_data


def load_bart_lines():
tbl_name = 'bart_lines'
with gzip.open(os.path.join(DATA_FOLDER, 'bart-lines.json.gz')) as f:
df = pd.read_json(f, encoding='latin-1')
df['path_json'] = df.path.map(json.dumps)
df['polyline'] = df.path.map(polyline.encode)
del df['path']
content = get_example_data('bart-lines.json.gz')
df = pd.read_json(content, encoding='latin-1')
df['path_json'] = df.path.map(json.dumps)
df['polyline'] = df.path.map(polyline.encode)
del df['path']

df.to_sql(
tbl_name,
db.engine,
Expand Down
97 changes: 0 additions & 97 deletions superset/data/birth_france_data_for_country_map.csv

This file was deleted.

Binary file removed superset/data/birth_names.json.gz
Binary file not shown.
5 changes: 3 additions & 2 deletions superset/data/birth_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
config,
Dash,
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
Slice,
Expand All @@ -39,8 +40,8 @@

def load_birth_names():
"""Loading birth name dataset from a zip file in the repo"""
with gzip.open(os.path.join(DATA_FOLDER, 'birth_names.json.gz')) as f:
pdf = pd.read_json(f)
data = get_example_data('birth_names.json.gz')
pdf = pd.read_json(data)
pdf.ds = pd.to_datetime(pdf.ds, unit='ms')
pdf.to_sql(
'birth_names',
Expand Down
Binary file removed superset/data/countries.json.gz
Binary file not shown.
6 changes: 4 additions & 2 deletions superset/data/country_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from superset.utils import core as utils
from .helpers import (
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
misc_dash_slices,
Expand All @@ -35,8 +36,9 @@

def load_country_map_data():
"""Loading data for map with country map"""
csv_path = os.path.join(DATA_FOLDER, 'birth_france_data_for_country_map.csv')
data = pd.read_csv(csv_path, encoding='utf-8')
csv_bytes = get_example_data(
'birth_france_data_for_country_map.csv', is_gzip=False, make_bytes=True)
data = pd.read_csv(csv_bytes, encoding='utf-8')
data['dttm'] = datetime.datetime.now().date()
data.to_sql( # pylint: disable=no-member
'birth_france_by_region',
Expand Down
Binary file removed superset/data/energy.json.gz
Binary file not shown.
8 changes: 5 additions & 3 deletions superset/data/energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,16 @@
from superset import db
from superset.connectors.sqla.models import SqlMetric
from superset.utils import core as utils
from .helpers import DATA_FOLDER, merge_slice, misc_dash_slices, Slice, TBL
from .helpers import (
DATA_FOLDER, get_example_data, merge_slice, misc_dash_slices, Slice, TBL,
)


def load_energy():
"""Loads an energy related dataset to use with sankey and graphs"""
tbl_name = 'energy_usage'
with gzip.open(os.path.join(DATA_FOLDER, 'energy.json.gz')) as f:
pdf = pd.read_json(f)
data = get_example_data('energy.json.gz')
pdf = pd.read_json(data)
pdf.to_sql(
tbl_name,
db.engine,
Expand Down
Binary file removed superset/data/flight_data.csv.gz
Binary file not shown.
10 changes: 5 additions & 5 deletions superset/data/flights.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,18 @@

from superset import db
from superset.utils import core as utils
from .helpers import DATA_FOLDER, TBL
from .helpers import DATA_FOLDER, get_example_data, TBL


def load_flights():
"""Loading random time series data from a zip file in the repo"""
tbl_name = 'flights'
with gzip.open(os.path.join(DATA_FOLDER, 'flight_data.csv.gz')) as f:
pdf = pd.read_csv(f, encoding='latin-1')
data = get_example_data('flight_data.csv.gz', make_bytes=True)
pdf = pd.read_csv(data, encoding='latin-1')

# Loading airports info to join and get lat/long
with gzip.open(os.path.join(DATA_FOLDER, 'airports.csv.gz')) as f:
airports = pd.read_csv(f, encoding='latin-1')
airports_bytes = get_example_data('airports.csv.gz', make_bytes=True)
airports = pd.read_csv(airports_bytes, encoding='latin-1')
airports = airports.set_index('IATA_CODE')

pdf['ds'] = pdf.YEAR.map(str) + '-0' + pdf.MONTH.map(str) + '-0' + pdf.DAY.map(str)
Expand Down
15 changes: 15 additions & 0 deletions superset/data/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,19 @@
# under the License.
"""Loads datasets, dashboards and slices in a new superset instance"""
# pylint: disable=C,R,W
from io import BytesIO
import json
import os
import zlib

import requests

from superset import app, db
from superset.connectors.connector_registry import ConnectorRegistry
from superset.models import core as models

BASE_URL = 'https://github.com/apache-superset/examples-data/blob/master/'

# Shortcuts
DB = models.Database
Slice = models.Slice
Expand Down Expand Up @@ -60,3 +66,12 @@ def get_slice_json(defaults, **kwargs):
d = defaults.copy()
d.update(kwargs)
return json.dumps(d, indent=4, sort_keys=True)


def get_example_data(filepath, is_gzip=True, make_bytes=False):
    """Download an example dataset from the remote examples-data repository.

    :param filepath: file path relative to ``BASE_URL``
    :param is_gzip: when True (default), gunzip the downloaded payload
    :param make_bytes: when True, wrap the payload in a ``BytesIO`` so it
        can be handed to pandas readers that expect a file-like object
    :returns: raw ``bytes``, or a ``BytesIO`` when ``make_bytes`` is True
    :raises requests.HTTPError: if the download fails
    """
    response = requests.get(f'{BASE_URL}{filepath}?raw=true')
    # Fail loudly on a bad download (404, 500, ...). Without this, the HTML
    # error page would be fed to zlib/pandas, producing a confusing error
    # far away from the real cause.
    response.raise_for_status()
    content = response.content
    if is_gzip:
        # wbits = 16 + MAX_WBITS tells zlib to expect a gzip header/trailer.
        content = zlib.decompress(content, zlib.MAX_WBITS | 16)
    if make_bytes:
        content = BytesIO(content)
    return content
5 changes: 3 additions & 2 deletions superset/data/long_lat.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from superset.utils import core as utils
from .helpers import (
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
misc_dash_slices,
Expand All @@ -37,8 +38,8 @@

def load_long_lat_data():
"""Loading lat/long data from a csv file in the repo"""
with gzip.open(os.path.join(DATA_FOLDER, 'san_francisco.csv.gz')) as f:
pdf = pd.read_csv(f, encoding='utf-8')
data = get_example_data('san_francisco.csv.gz', make_bytes=True)
pdf = pd.read_csv(data, encoding='utf-8')
start = datetime.datetime.now().replace(
hour=0, minute=0, second=0, microsecond=0)
pdf['datetime'] = [
Expand Down
Binary file removed superset/data/multiformat_time_series.json.gz
Binary file not shown.
6 changes: 4 additions & 2 deletions superset/data/multiformat_time_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from .helpers import (
config,
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
misc_dash_slices,
Expand All @@ -35,8 +36,9 @@

def load_multiformat_time_series():
"""Loading time series data from a zip file in the repo"""
with gzip.open(os.path.join(DATA_FOLDER, 'multiformat_time_series.json.gz')) as f:
pdf = pd.read_json(f)
data = get_example_data('multiformat_time_series.json.gz')
pdf = pd.read_json(data)

pdf.ds = pd.to_datetime(pdf.ds, unit='s')
pdf.ds2 = pd.to_datetime(pdf.ds2, unit='s')
pdf.to_sql(
Expand Down
8 changes: 4 additions & 4 deletions superset/data/paris.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@

from superset import db
from superset.utils import core as utils
from .helpers import DATA_FOLDER, TBL
from .helpers import DATA_FOLDER, get_example_data, TBL


def load_paris_iris_geojson():
tbl_name = 'paris_iris_mapping'

with gzip.open(os.path.join(DATA_FOLDER, 'paris_iris.json.gz')) as f:
df = pd.read_json(f)
df['features'] = df.features.map(json.dumps)
data = get_example_data('paris_iris.json.gz')
df = pd.read_json(data)
df['features'] = df.features.map(json.dumps)

df.to_sql(
tbl_name,
Expand Down
Binary file removed superset/data/paris_iris.json.gz
Binary file not shown.
Binary file removed superset/data/random_time_series.json.gz
Binary file not shown.
5 changes: 3 additions & 2 deletions superset/data/random_time_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from .helpers import (
config,
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
Slice,
Expand All @@ -34,8 +35,8 @@

def load_random_time_series_data():
"""Loading random time series data from a zip file in the repo"""
with gzip.open(os.path.join(DATA_FOLDER, 'random_time_series.json.gz')) as f:
pdf = pd.read_json(f)
data = get_example_data('random_time_series.json.gz')
pdf = pd.read_json(data)
pdf.ds = pd.to_datetime(pdf.ds, unit='s')
pdf.to_sql(
'random_time_series',
Expand Down
Binary file removed superset/data/san_francisco.csv.gz
Binary file not shown.
Binary file removed superset/data/sf_population.json.gz
Binary file not shown.
8 changes: 4 additions & 4 deletions superset/data/sf_population_polygons.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@

from superset import db
from superset.utils import core as utils
from .helpers import DATA_FOLDER, TBL
from .helpers import DATA_FOLDER, get_example_data, TBL


def load_sf_population_polygons():
tbl_name = 'sf_population_polygons'

with gzip.open(os.path.join(DATA_FOLDER, 'sf_population.json.gz')) as f:
df = pd.read_json(f)
df['contour'] = df.contour.map(json.dumps)
data = get_example_data('sf_population.json.gz')
df = pd.read_json(data)
df['contour'] = df.contour.map(json.dumps)

df.to_sql(
tbl_name,
Expand Down
6 changes: 4 additions & 2 deletions superset/data/unicode_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
config,
Dash,
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
Slice,
Expand All @@ -38,8 +39,9 @@

def load_unicode_test_data():
"""Loading unicode test dataset from a csv file in the repo"""
df = pd.read_csv(os.path.join(DATA_FOLDER, 'unicode_utf8_unixnl_test.csv'),
encoding='utf-8')
data = get_example_data(
'unicode_utf8_unixnl_test.csv', is_gzip=False, make_bytes=True)
df = pd.read_csv(data, encoding='utf-8')
# generate date/numeric data
df['dttm'] = datetime.datetime.now().date()
df['value'] = [random.randint(1, 100) for _ in range(len(df))]
Expand Down
42 changes: 0 additions & 42 deletions superset/data/unicode_utf8_unixnl_test.csv

This file was deleted.

Loading

0 comments on commit 1ac4964

Please sign in to comment.