[load_examples] download data at runtime (apache#7314)
* [load_examples] download data at runtime

When running `superset load_examples` to load example data sets,
Superset used to load them from the local package. This created a few issues,
notably around licensing (what are these datasets licensed as?) and
around package size.

For now, I moved the data sets here:
https://github.com/apache-superset/examples-data

Altered the logic to download the data from where it is stored.

* flakes

(cherry picked from commit 0088895)
mistercrunch committed Apr 19, 2019
1 parent 79b3169 commit 01b71aa
Showing 26 changed files with 64 additions and 203 deletions.
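
The crux of the change: each loader swaps a local gzip.open() read from
DATA_FOLDER for a call to a new get_example_data() helper that fetches the
file over HTTP at load time. A before/after condensed from the bart_lines.py
hunk below (the same pattern repeats in every loader):

    # before: read the dataset bundled with the local package
    with gzip.open(os.path.join(DATA_FOLDER, 'bart-lines.json.gz')) as f:
        df = pd.read_json(f, encoding='latin-1')

    # after: download the dataset from the examples-data repo at runtime
    content = get_example_data('bart-lines.json.gz')
    df = pd.read_json(content, encoding='latin-1')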
Binary file removed superset/data/airports.csv.gz
Binary file removed superset/data/bart-lines.json.gz
15 changes: 7 additions & 8 deletions superset/data/bart_lines.py
@@ -14,26 +14,25 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import gzip
 import json
-import os
 
 import pandas as pd
 import polyline
 from sqlalchemy import String, Text
 
 from superset import db
 from superset.utils.core import get_or_create_main_db
-from .helpers import DATA_FOLDER, TBL
+from .helpers import TBL, get_example_data
 
 
 def load_bart_lines():
     tbl_name = 'bart_lines'
-    with gzip.open(os.path.join(DATA_FOLDER, 'bart-lines.json.gz')) as f:
-        df = pd.read_json(f, encoding='latin-1')
-        df['path_json'] = df.path.map(json.dumps)
-        df['polyline'] = df.path.map(polyline.encode)
-        del df['path']
+    content = get_example_data('bart-lines.json.gz')
+    df = pd.read_json(content, encoding='latin-1')
+    df['path_json'] = df.path.map(json.dumps)
+    df['polyline'] = df.path.map(polyline.encode)
+    del df['path']
+
     df.to_sql(
         tbl_name,
         db.engine,
97 changes: 0 additions & 97 deletions superset/data/birth_france_data_for_country_map.csv

This file was deleted.

Binary file removed superset/data/birth_names.json.gz
8 changes: 3 additions & 5 deletions superset/data/birth_names.py
@@ -14,9 +14,7 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import gzip
 import json
-import os
 import textwrap
 
 import pandas as pd
@@ -28,7 +26,7 @@
 from .helpers import (
     config,
     Dash,
-    DATA_FOLDER,
+    get_example_data,
     get_slice_json,
     merge_slice,
     Slice,
@@ -39,8 +37,8 @@
 
 def load_birth_names():
     """Loading birth name dataset from a zip file in the repo"""
-    with gzip.open(os.path.join(DATA_FOLDER, 'birth_names.json.gz')) as f:
-        pdf = pd.read_json(f)
+    data = get_example_data('birth_names.json.gz')
+    pdf = pd.read_json(data)
     pdf.ds = pd.to_datetime(pdf.ds, unit='ms')
     pdf.to_sql(
         'birth_names',
Binary file removed superset/data/countries.json.gz
8 changes: 4 additions & 4 deletions superset/data/country_map.py
@@ -15,7 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 import datetime
-import os
 
 import pandas as pd
 from sqlalchemy import BigInteger, Date, String
@@ -24,7 +23,7 @@
 from superset.connectors.sqla.models import SqlMetric
 from superset.utils import core as utils
 from .helpers import (
-    DATA_FOLDER,
+    get_example_data,
     get_slice_json,
     merge_slice,
     misc_dash_slices,
@@ -35,8 +34,9 @@
 
 def load_country_map_data():
     """Loading data for map with country map"""
-    csv_path = os.path.join(DATA_FOLDER, 'birth_france_data_for_country_map.csv')
-    data = pd.read_csv(csv_path, encoding='utf-8')
+    csv_bytes = get_example_data(
+        'birth_france_data_for_country_map.csv', is_gzip=False, make_bytes=True)
+    data = pd.read_csv(csv_bytes, encoding='utf-8')
     data['dttm'] = datetime.datetime.now().date()
     data.to_sql(  # pylint: disable=no-member
         'birth_france_by_region',
Binary file removed superset/data/energy.json.gz
10 changes: 5 additions & 5 deletions superset/data/energy.py
@@ -16,8 +16,6 @@
 # under the License.
 """Loads datasets, dashboards and slices in a new superset instance"""
 # pylint: disable=C,R,W
-import gzip
-import os
 import textwrap
 
 import pandas as pd
@@ -26,14 +24,16 @@
 from superset import db
 from superset.connectors.sqla.models import SqlMetric
 from superset.utils import core as utils
-from .helpers import DATA_FOLDER, merge_slice, misc_dash_slices, Slice, TBL
+from .helpers import (
+    DATA_FOLDER, get_example_data, merge_slice, misc_dash_slices, Slice, TBL,
+)
 
 
 def load_energy():
     """Loads an energy related dataset to use with sankey and graphs"""
     tbl_name = 'energy_usage'
-    with gzip.open(os.path.join(DATA_FOLDER, 'energy.json.gz')) as f:
-        pdf = pd.read_json(f)
+    data = get_example_data('energy.json.gz')
+    pdf = pd.read_json(data)
     pdf.to_sql(
         tbl_name,
         db.engine,
Binary file removed superset/data/flight_data.csv.gz
13 changes: 5 additions & 8 deletions superset/data/flights.py
@@ -14,26 +14,23 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import gzip
-import os
 
 import pandas as pd
 from sqlalchemy import DateTime
 
 from superset import db
 from superset.utils import core as utils
-from .helpers import DATA_FOLDER, TBL
+from .helpers import get_example_data, TBL
 
 
 def load_flights():
     """Loading random time series data from a zip file in the repo"""
     tbl_name = 'flights'
-    with gzip.open(os.path.join(DATA_FOLDER, 'flight_data.csv.gz')) as f:
-        pdf = pd.read_csv(f, encoding='latin-1')
+    data = get_example_data('flight_data.csv.gz', make_bytes=True)
+    pdf = pd.read_csv(data, encoding='latin-1')
-
     # Loading airports info to join and get lat/long
-    with gzip.open(os.path.join(DATA_FOLDER, 'airports.csv.gz')) as f:
-        airports = pd.read_csv(f, encoding='latin-1')
+    airports_bytes = get_example_data('airports.csv.gz', make_bytes=True)
+    airports = pd.read_csv(airports_bytes, encoding='latin-1')
     airports = airports.set_index('IATA_CODE')
 
     pdf['ds'] = pdf.YEAR.map(str) + '-0' + pdf.MONTH.map(str) + '-0' + pdf.DAY.map(str)
15 changes: 15 additions & 0 deletions superset/data/helpers.py
@@ -16,13 +16,19 @@
 # under the License.
 """Loads datasets, dashboards and slices in a new superset instance"""
 # pylint: disable=C,R,W
+from io import BytesIO
 import json
 import os
+import zlib
 
+import requests
+
 from superset import app, db
 from superset.connectors.connector_registry import ConnectorRegistry
 from superset.models import core as models
 
+BASE_URL = 'https://github.com/apache-superset/examples-data/blob/master/'
+
 # Shortcuts
 DB = models.Database
 Slice = models.Slice
@@ -60,3 +66,12 @@ def get_slice_json(defaults, **kwargs):
     d = defaults.copy()
     d.update(kwargs)
     return json.dumps(d, indent=4, sort_keys=True)
+
+
+def get_example_data(filepath, is_gzip=True, make_bytes=False):
+    content = requests.get(f'{BASE_URL}{filepath}?raw=true').content
+    if is_gzip:
+        content = zlib.decompress(content, zlib.MAX_WBITS|16)
+    if make_bytes:
+        content = BytesIO(content)
+    return content
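
As exercised by the loaders in this commit, the helper has three call shapes.
A short usage sketch, with filenames, parameters, and encodings taken from the
hunks above and below:

    import pandas as pd

    from superset.data.helpers import get_example_data

    # gzipped JSON: the helper returns decompressed bytes, which
    # pd.read_json accepts directly
    pdf = pd.read_json(get_example_data('energy.json.gz'))

    # gzipped CSV: make_bytes=True wraps the bytes in BytesIO, since
    # pd.read_csv expects a path or file-like object
    pdf = pd.read_csv(
        get_example_data('flight_data.csv.gz', make_bytes=True),
        encoding='latin-1')

    # plain CSV: is_gzip=False skips the zlib decompression step
    pdf = pd.read_csv(
        get_example_data(
            'birth_france_data_for_country_map.csv',
            is_gzip=False, make_bytes=True),
        encoding='utf-8')

Two implementation notes: the ?raw=true query string makes GitHub redirect to
the raw file contents rather than the HTML blob page, and passing
zlib.MAX_WBITS|16 to zlib.decompress tells zlib to expect a gzip header on
the stream.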
8 changes: 3 additions & 5 deletions superset/data/long_lat.py
@@ -15,8 +15,6 @@
 # specific language governing permissions and limitations
 # under the License.
 import datetime
-import gzip
-import os
 import random
 
 import geohash
@@ -26,7 +24,7 @@
 from superset import db
 from superset.utils import core as utils
 from .helpers import (
-    DATA_FOLDER,
+    get_example_data,
     get_slice_json,
     merge_slice,
     misc_dash_slices,
@@ -37,8 +35,8 @@
 
 def load_long_lat_data():
     """Loading lat/long data from a csv file in the repo"""
-    with gzip.open(os.path.join(DATA_FOLDER, 'san_francisco.csv.gz')) as f:
-        pdf = pd.read_csv(f, encoding='utf-8')
+    data = get_example_data('san_francisco.csv.gz', make_bytes=True)
+    pdf = pd.read_csv(data, encoding='utf-8')
     start = datetime.datetime.now().replace(
         hour=0, minute=0, second=0, microsecond=0)
     pdf['datetime'] = [
Binary file removed superset/data/multiformat_time_series.json.gz
9 changes: 4 additions & 5 deletions superset/data/multiformat_time_series.py
@@ -14,8 +14,6 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import gzip
-import os
 
 import pandas as pd
 from sqlalchemy import BigInteger, Date, DateTime, String
@@ -24,7 +22,7 @@
 from superset.utils import core as utils
 from .helpers import (
     config,
-    DATA_FOLDER,
+    get_example_data,
     get_slice_json,
     merge_slice,
     misc_dash_slices,
@@ -35,8 +33,8 @@
 
 def load_multiformat_time_series():
     """Loading time series data from a zip file in the repo"""
-    with gzip.open(os.path.join(DATA_FOLDER, 'multiformat_time_series.json.gz')) as f:
-        pdf = pd.read_json(f)
+    data = get_example_data('multiformat_time_series.json.gz')
+    pdf = pd.read_json(data)
+
     pdf.ds = pd.to_datetime(pdf.ds, unit='s')
     pdf.ds2 = pd.to_datetime(pdf.ds2, unit='s')
     pdf.to_sql(
10 changes: 4 additions & 6 deletions superset/data/paris.py
@@ -14,24 +14,22 @@
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
-import gzip
 import json
-import os
 
 import pandas as pd
 from sqlalchemy import String, Text
 
 from superset import db
 from superset.utils import core as utils
-from .helpers import DATA_FOLDER, TBL
+from .helpers import TBL, get_example_data
 
 
 def load_paris_iris_geojson():
     tbl_name = 'paris_iris_mapping'
 
-    with gzip.open(os.path.join(DATA_FOLDER, 'paris_iris.json.gz')) as f:
-        df = pd.read_json(f)
-        df['features'] = df.features.map(json.dumps)
+    data = get_example_data('paris_iris.json.gz')
+    df = pd.read_json(data)
+    df['features'] = df.features.map(json.dumps)
 
     df.to_sql(
         tbl_name,
Binary file removed superset/data/paris_iris.json.gz
Binary file removed superset/data/random_time_series.json.gz