Skip to content

Commit

Permalink
[load_examples] download data at runtime
Browse files Browse the repository at this point in the history
When running `superset load_examples` to load example data sets,
Superset used to load from the local package. This created a few issues
notably around licensing (what are these datasets licensed as?) and
around package size.

For now, I moved the data sets here:
https://github.com/apache-superset/examples-data

Altered the logic to download the data from where it is stored.
  • Loading branch information
mistercrunch committed Apr 17, 2019
1 parent 154f6ab commit 1ac4964
Show file tree
Hide file tree
Showing 26 changed files with 64 additions and 175 deletions.
Binary file removed superset/data/airports.csv.gz
Binary file not shown.
Binary file removed superset/data/bart-lines.json.gz
Binary file not shown.
13 changes: 7 additions & 6 deletions superset/data/bart_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,17 @@

from superset import db
from superset.utils.core import get_or_create_main_db
from .helpers import DATA_FOLDER, TBL
from .helpers import DATA_FOLDER, TBL, get_example_data


def load_bart_lines():
tbl_name = 'bart_lines'
with gzip.open(os.path.join(DATA_FOLDER, 'bart-lines.json.gz')) as f:
df = pd.read_json(f, encoding='latin-1')
df['path_json'] = df.path.map(json.dumps)
df['polyline'] = df.path.map(polyline.encode)
del df['path']
content = get_example_data('bart-lines.json.gz')
df = pd.read_json(content, encoding='latin-1')
df['path_json'] = df.path.map(json.dumps)
df['polyline'] = df.path.map(polyline.encode)
del df['path']

df.to_sql(
tbl_name,
db.engine,
Expand Down
97 changes: 0 additions & 97 deletions superset/data/birth_france_data_for_country_map.csv

This file was deleted.

Binary file removed superset/data/birth_names.json.gz
Binary file not shown.
5 changes: 3 additions & 2 deletions superset/data/birth_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
config,
Dash,
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
Slice,
Expand All @@ -39,8 +40,8 @@

def load_birth_names():
"""Loading birth name dataset from a zip file in the repo"""
with gzip.open(os.path.join(DATA_FOLDER, 'birth_names.json.gz')) as f:
pdf = pd.read_json(f)
data = get_example_data('birth_names.json.gz')
pdf = pd.read_json(data)
pdf.ds = pd.to_datetime(pdf.ds, unit='ms')
pdf.to_sql(
'birth_names',
Expand Down
Binary file removed superset/data/countries.json.gz
Binary file not shown.
6 changes: 4 additions & 2 deletions superset/data/country_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from superset.utils import core as utils
from .helpers import (
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
misc_dash_slices,
Expand All @@ -35,8 +36,9 @@

def load_country_map_data():
"""Loading data for map with country map"""
csv_path = os.path.join(DATA_FOLDER, 'birth_france_data_for_country_map.csv')
data = pd.read_csv(csv_path, encoding='utf-8')
csv_bytes = get_example_data(
'birth_france_data_for_country_map.csv', is_gzip=False, make_bytes=True)
data = pd.read_csv(csv_bytes, encoding='utf-8')
data['dttm'] = datetime.datetime.now().date()
data.to_sql( # pylint: disable=no-member
'birth_france_by_region',
Expand Down
Binary file removed superset/data/energy.json.gz
Binary file not shown.
8 changes: 5 additions & 3 deletions superset/data/energy.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,16 @@
from superset import db
from superset.connectors.sqla.models import SqlMetric
from superset.utils import core as utils
from .helpers import DATA_FOLDER, merge_slice, misc_dash_slices, Slice, TBL
from .helpers import (
DATA_FOLDER, get_example_data, merge_slice, misc_dash_slices, Slice, TBL,
)


def load_energy():
"""Loads an energy related dataset to use with sankey and graphs"""
tbl_name = 'energy_usage'
with gzip.open(os.path.join(DATA_FOLDER, 'energy.json.gz')) as f:
pdf = pd.read_json(f)
data = get_example_data('energy.json.gz')
pdf = pd.read_json(data)
pdf.to_sql(
tbl_name,
db.engine,
Expand Down
Binary file removed superset/data/flight_data.csv.gz
Binary file not shown.
10 changes: 5 additions & 5 deletions superset/data/flights.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,18 @@

from superset import db
from superset.utils import core as utils
from .helpers import DATA_FOLDER, TBL
from .helpers import DATA_FOLDER, get_example_data, TBL


def load_flights():
"""Loading random time series data from a zip file in the repo"""
tbl_name = 'flights'
with gzip.open(os.path.join(DATA_FOLDER, 'flight_data.csv.gz')) as f:
pdf = pd.read_csv(f, encoding='latin-1')
data = get_example_data('flight_data.csv.gz', make_bytes=True)
pdf = pd.read_csv(data, encoding='latin-1')

# Loading airports info to join and get lat/long
with gzip.open(os.path.join(DATA_FOLDER, 'airports.csv.gz')) as f:
airports = pd.read_csv(f, encoding='latin-1')
airports_bytes = get_example_data('airports.csv.gz', make_bytes=True)
airports = pd.read_csv(airports_bytes, encoding='latin-1')
airports = airports.set_index('IATA_CODE')

pdf['ds'] = pdf.YEAR.map(str) + '-0' + pdf.MONTH.map(str) + '-0' + pdf.DAY.map(str)
Expand Down
15 changes: 15 additions & 0 deletions superset/data/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,19 @@
# under the License.
"""Loads datasets, dashboards and slices in a new superset instance"""
# pylint: disable=C,R,W
from io import BytesIO
import json
import os
import zlib

import requests

from superset import app, db
from superset.connectors.connector_registry import ConnectorRegistry
from superset.models import core as models

BASE_URL = 'https://github.com/apache-superset/examples-data/blob/master/'

# Shortcuts
DB = models.Database
Slice = models.Slice
Expand Down Expand Up @@ -60,3 +66,12 @@ def get_slice_json(defaults, **kwargs):
d = defaults.copy()
d.update(kwargs)
return json.dumps(d, indent=4, sort_keys=True)


def get_example_data(filepath, is_gzip=True, make_bytes=False):
    """Download an example dataset from the remote examples-data repository.

    :param filepath: file path relative to ``BASE_URL``
    :param is_gzip: when True (default), gunzip the downloaded payload
    :param make_bytes: when True, wrap the payload in a ``BytesIO`` so it
        can be handed to pandas readers that expect a file-like object
    :returns: raw ``bytes``, or a ``BytesIO`` when ``make_bytes`` is True
    :raises requests.HTTPError: if the download fails
    """
    response = requests.get(f'{BASE_URL}{filepath}?raw=true')
    # Fail loudly on a bad download (404, 500, ...). Without this, the HTML
    # error page would be fed to zlib/pandas, producing a confusing error
    # far away from the real cause.
    response.raise_for_status()
    content = response.content
    if is_gzip:
        # wbits = 16 + MAX_WBITS tells zlib to expect a gzip header/trailer.
        content = zlib.decompress(content, zlib.MAX_WBITS | 16)
    if make_bytes:
        content = BytesIO(content)
    return content
5 changes: 3 additions & 2 deletions superset/data/long_lat.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from superset.utils import core as utils
from .helpers import (
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
misc_dash_slices,
Expand All @@ -37,8 +38,8 @@

def load_long_lat_data():
"""Loading lat/long data from a csv file in the repo"""
with gzip.open(os.path.join(DATA_FOLDER, 'san_francisco.csv.gz')) as f:
pdf = pd.read_csv(f, encoding='utf-8')
data = get_example_data('san_francisco.csv.gz', make_bytes=True)
pdf = pd.read_csv(data, encoding='utf-8')
start = datetime.datetime.now().replace(
hour=0, minute=0, second=0, microsecond=0)
pdf['datetime'] = [
Expand Down
Binary file removed superset/data/multiformat_time_series.json.gz
Binary file not shown.
6 changes: 4 additions & 2 deletions superset/data/multiformat_time_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from .helpers import (
config,
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
misc_dash_slices,
Expand All @@ -35,8 +36,9 @@

def load_multiformat_time_series():
"""Loading time series data from a zip file in the repo"""
with gzip.open(os.path.join(DATA_FOLDER, 'multiformat_time_series.json.gz')) as f:
pdf = pd.read_json(f)
data = get_example_data('multiformat_time_series.json.gz')
pdf = pd.read_json(data)

pdf.ds = pd.to_datetime(pdf.ds, unit='s')
pdf.ds2 = pd.to_datetime(pdf.ds2, unit='s')
pdf.to_sql(
Expand Down
8 changes: 4 additions & 4 deletions superset/data/paris.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@

from superset import db
from superset.utils import core as utils
from .helpers import DATA_FOLDER, TBL
from .helpers import DATA_FOLDER, get_example_data, TBL


def load_paris_iris_geojson():
tbl_name = 'paris_iris_mapping'

with gzip.open(os.path.join(DATA_FOLDER, 'paris_iris.json.gz')) as f:
df = pd.read_json(f)
df['features'] = df.features.map(json.dumps)
data = get_example_data('paris_iris.json.gz')
df = pd.read_json(data)
df['features'] = df.features.map(json.dumps)

df.to_sql(
tbl_name,
Expand Down
Binary file removed superset/data/paris_iris.json.gz
Binary file not shown.
Binary file removed superset/data/random_time_series.json.gz
Binary file not shown.
5 changes: 3 additions & 2 deletions superset/data/random_time_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from .helpers import (
config,
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
Slice,
Expand All @@ -34,8 +35,8 @@

def load_random_time_series_data():
"""Loading random time series data from a zip file in the repo"""
with gzip.open(os.path.join(DATA_FOLDER, 'random_time_series.json.gz')) as f:
pdf = pd.read_json(f)
data = get_example_data('random_time_series.json.gz')
pdf = pd.read_json(data)
pdf.ds = pd.to_datetime(pdf.ds, unit='s')
pdf.to_sql(
'random_time_series',
Expand Down
Binary file removed superset/data/san_francisco.csv.gz
Binary file not shown.
Binary file removed superset/data/sf_population.json.gz
Binary file not shown.
8 changes: 4 additions & 4 deletions superset/data/sf_population_polygons.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,15 @@

from superset import db
from superset.utils import core as utils
from .helpers import DATA_FOLDER, TBL
from .helpers import DATA_FOLDER, get_example_data, TBL


def load_sf_population_polygons():
tbl_name = 'sf_population_polygons'

with gzip.open(os.path.join(DATA_FOLDER, 'sf_population.json.gz')) as f:
df = pd.read_json(f)
df['contour'] = df.contour.map(json.dumps)
data = get_example_data('sf_population.json.gz')
df = pd.read_json(data)
df['contour'] = df.contour.map(json.dumps)

df.to_sql(
tbl_name,
Expand Down
6 changes: 4 additions & 2 deletions superset/data/unicode_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
config,
Dash,
DATA_FOLDER,
get_example_data,
get_slice_json,
merge_slice,
Slice,
Expand All @@ -38,8 +39,9 @@

def load_unicode_test_data():
"""Loading unicode test dataset from a csv file in the repo"""
df = pd.read_csv(os.path.join(DATA_FOLDER, 'unicode_utf8_unixnl_test.csv'),
encoding='utf-8')
data = get_example_data(
'unicode_utf8_unixnl_test.csv', is_gzip=False, make_bytes=True)
df = pd.read_csv(data, encoding='utf-8')
# generate date/numeric data
df['dttm'] = datetime.datetime.now().date()
df['value'] = [random.randint(1, 100) for _ in range(len(df))]
Expand Down
42 changes: 0 additions & 42 deletions superset/data/unicode_utf8_unixnl_test.csv

This file was deleted.

Loading

0 comments on commit 1ac4964

Please sign in to comment.