[load_examples] download data at runtime (apache#7314)
* [load_examples] download data at runtime

When running `superset load_examples` to load example data sets,
Superset used to load them from the local package. This created a few issues,
notably around licensing (what are these datasets licensed as?) and
around package size.

For now, I moved the data sets here:
https://github.com/apache-superset/examples-data

Altered the logic to download the data from where it is stored.

* flakes

(cherry picked from commit 0088895)
mistercrunch committed Apr 19, 2019
1 parent 79b3169 commit 01b71aa
Showing 26 changed files with 64 additions and 203 deletions.
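
The crux of the change: each loader swaps a local gzip.open() read from
DATA_FOLDER for a call to a new get_example_data() helper that fetches the
file over HTTP at load time. A before/after condensed from the bart_lines.py
hunk below (the same pattern repeats in every loader):

    # before: read the dataset bundled with the local package
    with gzip.open(os.path.join(DATA_FOLDER, 'bart-lines.json.gz')) as f:
        df = pd.read_json(f, encoding='latin-1')

    # after: download the dataset from the examples-data repo at runtime
    content = get_example_data('bart-lines.json.gz')
    df = pd.read_json(content, encoding='latin-1')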
Binary file removed superset/data/airports.csv.gz
Binary file removed superset/data/bart-lines.json.gz
15 changes: 7 additions & 8 deletions superset/data/bart_lines.py
@@ -14,26 +14,25 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import gzip
 import json
-import os
 
 import pandas as pd
 import polyline
 from sqlalchemy import String, Text
 
 from superset import db
 from superset.utils.core import get_or_create_main_db
-from .helpers import DATA_FOLDER, TBL
+from .helpers import TBL, get_example_data
 
 
 def load_bart_lines():
     tbl_name = 'bart_lines'
-    with gzip.open(os.path.join(DATA_FOLDER, 'bart-lines.json.gz')) as f:
-        df = pd.read_json(f, encoding='latin-1')
-        df['path_json'] = df.path.map(json.dumps)
-        df['polyline'] = df.path.map(polyline.encode)
-        del df['path']
+    content = get_example_data('bart-lines.json.gz')
+    df = pd.read_json(content, encoding='latin-1')
+    df['path_json'] = df.path.map(json.dumps)
+    df['polyline'] = df.path.map(polyline.encode)
+    del df['path']
+
     df.to_sql(
         tbl_name,
         db.engine,
97 changes: 0 additions & 97 deletions superset/data/birth_france_data_for_country_map.csv

This file was deleted.

Binary file removed superset/data/birth_names.json.gz
8 changes: 3 additions & 5 deletions superset/data/birth_names.py
@@ -14,9 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import gzip
 import json
-import os
 import textwrap
 
 import pandas as pd
@@ -28,7 +26,7 @@
 from .helpers import (
     config,
     Dash,
-    DATA_FOLDER,
+    get_example_data,
     get_slice_json,
     merge_slice,
     Slice,
@@ -39,8 +37,8 @@
 
 def load_birth_names():
     """Loading birth name dataset from a zip file in the repo"""
-    with gzip.open(os.path.join(DATA_FOLDER, 'birth_names.json.gz')) as f:
-        pdf = pd.read_json(f)
+    data = get_example_data('birth_names.json.gz')
+    pdf = pd.read_json(data)
     pdf.ds = pd.to_datetime(pdf.ds, unit='ms')
     pdf.to_sql(
         'birth_names',
Binary file removed superset/data/countries.json.gz
8 changes: 4 additions & 4 deletions superset/data/country_map.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 import datetime
-import os
 
 import pandas as pd
 from sqlalchemy import BigInteger, Date, String
@@ -24,7 +23,7 @@
 from superset.connectors.sqla.models import SqlMetric
 from superset.utils import core as utils
 from .helpers import (
-    DATA_FOLDER,
+    get_example_data,
     get_slice_json,
     merge_slice,
     misc_dash_slices,
@@ -35,8 +34,9 @@
 
 def load_country_map_data():
     """Loading data for map with country map"""
-    csv_path = os.path.join(DATA_FOLDER, 'birth_france_data_for_country_map.csv')
-    data = pd.read_csv(csv_path, encoding='utf-8')
+    csv_bytes = get_example_data(
+        'birth_france_data_for_country_map.csv', is_gzip=False, make_bytes=True)
+    data = pd.read_csv(csv_bytes, encoding='utf-8')
     data['dttm'] = datetime.datetime.now().date()
     data.to_sql(  # pylint: disable=no-member
         'birth_france_by_region',
Binary file removed superset/data/energy.json.gz
10 changes: 5 additions & 5 deletions superset/data/energy.py
@@ -16,8 +16,6 @@
 # under the License.
 """Loads datasets, dashboards and slices in a new superset instance"""
 # pylint: disable=C,R,W
-import gzip
-import os
 import textwrap
 
 import pandas as pd
@@ -26,14 +24,16 @@
 from superset import db
 from superset.connectors.sqla.models import SqlMetric
 from superset.utils import core as utils
-from .helpers import DATA_FOLDER, merge_slice, misc_dash_slices, Slice, TBL
+from .helpers import (
+    DATA_FOLDER, get_example_data, merge_slice, misc_dash_slices, Slice, TBL,
+)
 
 
 def load_energy():
     """Loads an energy related dataset to use with sankey and graphs"""
     tbl_name = 'energy_usage'
-    with gzip.open(os.path.join(DATA_FOLDER, 'energy.json.gz')) as f:
-        pdf = pd.read_json(f)
+    data = get_example_data('energy.json.gz')
+    pdf = pd.read_json(data)
     pdf.to_sql(
         tbl_name,
         db.engine,
Binary file removed superset/data/flight_data.csv.gz
13 changes: 5 additions & 8 deletions superset/data/flights.py
@@ -14,26 +14,23 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import gzip
-import os
 
 import pandas as pd
 from sqlalchemy import DateTime
 
 from superset import db
 from superset.utils import core as utils
-from .helpers import DATA_FOLDER, TBL
+from .helpers import get_example_data, TBL
 
 
 def load_flights():
     """Loading random time series data from a zip file in the repo"""
     tbl_name = 'flights'
-    with gzip.open(os.path.join(DATA_FOLDER, 'flight_data.csv.gz')) as f:
-        pdf = pd.read_csv(f, encoding='latin-1')
+    data = get_example_data('flight_data.csv.gz', make_bytes=True)
+    pdf = pd.read_csv(data, encoding='latin-1')
-
     # Loading airports info to join and get lat/long
-    with gzip.open(os.path.join(DATA_FOLDER, 'airports.csv.gz')) as f:
-        airports = pd.read_csv(f, encoding='latin-1')
+    airports_bytes = get_example_data('airports.csv.gz', make_bytes=True)
+    airports = pd.read_csv(airports_bytes, encoding='latin-1')
     airports = airports.set_index('IATA_CODE')
 
     pdf['ds'] = pdf.YEAR.map(str) + '-0' + pdf.MONTH.map(str) + '-0' + pdf.DAY.map(str)
15 changes: 15 additions & 0 deletions superset/data/helpers.py
@@ -16,13 +16,19 @@
 # under the License.
 """Loads datasets, dashboards and slices in a new superset instance"""
 # pylint: disable=C,R,W
+from io import BytesIO
 import json
 import os
+import zlib
 
+import requests
+
 from superset import app, db
 from superset.connectors.connector_registry import ConnectorRegistry
 from superset.models import core as models
 
+BASE_URL = 'https://github.com/apache-superset/examples-data/blob/master/'
+
 # Shortcuts
 DB = models.Database
 Slice = models.Slice
@@ -60,3 +66,12 @@ def get_slice_json(defaults, **kwargs):
     d = defaults.copy()
     d.update(kwargs)
     return json.dumps(d, indent=4, sort_keys=True)
+
+
+def get_example_data(filepath, is_gzip=True, make_bytes=False):
+    content = requests.get(f'{BASE_URL}{filepath}?raw=true').content
+    if is_gzip:
+        content = zlib.decompress(content, zlib.MAX_WBITS|16)
+    if make_bytes:
+        content = BytesIO(content)
+    return content
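
As exercised by the loaders in this commit, the helper has three call shapes.
A short usage sketch, with filenames, parameters, and encodings taken from the
hunks above and below:

    import pandas as pd

    from superset.data.helpers import get_example_data

    # gzipped JSON: the helper returns decompressed bytes, which
    # pd.read_json accepts directly
    pdf = pd.read_json(get_example_data('energy.json.gz'))

    # gzipped CSV: make_bytes=True wraps the bytes in BytesIO, since
    # pd.read_csv expects a path or file-like object
    pdf = pd.read_csv(
        get_example_data('flight_data.csv.gz', make_bytes=True),
        encoding='latin-1')

    # plain CSV: is_gzip=False skips the zlib decompression step
    pdf = pd.read_csv(
        get_example_data(
            'birth_france_data_for_country_map.csv',
            is_gzip=False, make_bytes=True),
        encoding='utf-8')

Two implementation notes: the ?raw=true query string makes GitHub redirect to
the raw file contents rather than the HTML blob page, and passing
zlib.MAX_WBITS|16 to zlib.decompress tells zlib to expect a gzip header on
the stream.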
8 changes: 3 additions & 5 deletions superset/data/long_lat.py
@@ -15,8 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 import datetime
-import gzip
-import os
 import random
 
 import geohash
@@ -26,7 +24,7 @@
 from superset import db
 from superset.utils import core as utils
 from .helpers import (
-    DATA_FOLDER,
+    get_example_data,
     get_slice_json,
     merge_slice,
     misc_dash_slices,
@@ -37,8 +35,8 @@
 
 def load_long_lat_data():
     """Loading lat/long data from a csv file in the repo"""
-    with gzip.open(os.path.join(DATA_FOLDER, 'san_francisco.csv.gz')) as f:
-        pdf = pd.read_csv(f, encoding='utf-8')
+    data = get_example_data('san_francisco.csv.gz', make_bytes=True)
+    pdf = pd.read_csv(data, encoding='utf-8')
     start = datetime.datetime.now().replace(
         hour=0, minute=0, second=0, microsecond=0)
     pdf['datetime'] = [
Binary file removed superset/data/multiformat_time_series.json.gz
9 changes: 4 additions & 5 deletions superset/data/multiformat_time_series.py
@@ -14,8 +14,6 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import gzip
-import os
 
 import pandas as pd
 from sqlalchemy import BigInteger, Date, DateTime, String
@@ -24,7 +22,7 @@
 from superset.utils import core as utils
 from .helpers import (
     config,
-    DATA_FOLDER,
+    get_example_data,
     get_slice_json,
     merge_slice,
     misc_dash_slices,
@@ -35,8 +33,8 @@
 
 def load_multiformat_time_series():
     """Loading time series data from a zip file in the repo"""
-    with gzip.open(os.path.join(DATA_FOLDER, 'multiformat_time_series.json.gz')) as f:
-        pdf = pd.read_json(f)
+    data = get_example_data('multiformat_time_series.json.gz')
+    pdf = pd.read_json(data)
+
     pdf.ds = pd.to_datetime(pdf.ds, unit='s')
     pdf.ds2 = pd.to_datetime(pdf.ds2, unit='s')
     pdf.to_sql(
10 changes: 4 additions & 6 deletions superset/data/paris.py
@@ -14,24 +14,22 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import gzip
 import json
-import os
 
 import pandas as pd
 from sqlalchemy import String, Text
 
 from superset import db
 from superset.utils import core as utils
-from .helpers import DATA_FOLDER, TBL
+from .helpers import TBL, get_example_data
 
 
 def load_paris_iris_geojson():
     tbl_name = 'paris_iris_mapping'
 
-    with gzip.open(os.path.join(DATA_FOLDER, 'paris_iris.json.gz')) as f:
-        df = pd.read_json(f)
-        df['features'] = df.features.map(json.dumps)
+    data = get_example_data('paris_iris.json.gz')
+    df = pd.read_json(data)
+    df['features'] = df.features.map(json.dumps)
 
     df.to_sql(
         tbl_name,
Binary file removed superset/data/paris_iris.json.gz
Binary file removed superset/data/random_time_series.json.gz