Commit 6d9181a

Replace requests with urllib and further clean up pandas (#112)
* Replace requests with urllib and further clean up pandas
* Fix all the unit tests
1 parent 5956370 commit 6d9181a

14 files changed: +253 additions, −361 deletions

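Before the per-file diffs, a minimal sketch of the substitution this commit applies: a requests.post call is replaced with the standard-library urllib.request, and HTTP errors surface as urllib.error.HTTPError instead of a non-200 status code. The helper name post_json and its signature below are illustrative only, not the library's actual code; see datacommons/query.py further down for the real change.

import json
import urllib.error
import urllib.request


def post_json(url, payload, headers=None):
  """POST a JSON payload using only the standard library (no requests)."""
  all_headers = {'Content-Type': 'application/json'}
  all_headers.update(headers or {})
  # Passing data= makes this a POST request.
  req = urllib.request.Request(
      url, data=json.dumps(payload).encode('utf-8'), headers=all_headers)
  try:
    with urllib.request.urlopen(req) as res:
      return json.loads(res.read())
  except urllib.error.HTTPError as e:
    # urllib raises on non-2xx responses; with requests this needed an
    # explicit status_code check.
    raise ValueError('HTTP {} from server: {}'.format(e.code, e.read()))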

datacommons/core.py

Lines changed: 0 additions & 1 deletion

@@ -29,7 +29,6 @@
 from collections import defaultdict
 
 import datacommons.utils as utils
-import requests
 
 # ----------------------------- WRAPPER FUNCTIONS -----------------------------
 

datacommons/examples/BUILD.bazel

Lines changed: 0 additions & 4 deletions

@@ -5,7 +5,6 @@ py_binary(
     srcs=["core.py"],
     deps=[
         "//datacommons:datacommons",
-        requirement("pandas"),
     ]
 )
 
@@ -14,7 +13,6 @@ py_binary(
     srcs=["places.py"],
     deps=[
         "//datacommons:datacommons",
-        requirement("pandas"),
     ]
 )
 
@@ -23,7 +21,6 @@ py_binary(
     srcs=["populations.py"],
     deps=[
         "//datacommons:datacommons",
-        requirement("pandas"),
     ]
 )
 
@@ -32,6 +29,5 @@ py_binary(
     srcs=["query.py"],
     deps=[
         "//datacommons:datacommons",
-        requirement("pandas"),
     ]
 )

datacommons/examples/core.py

Lines changed: 3 additions & 36 deletions

@@ -21,26 +21,21 @@
 from __future__ import print_function
 
 import datacommons as dc
-import pandas as pd
-
-import datacommons.utils as utils
-
 
 def main():
   # Set the dcid to be that of Santa Clara County.
   dcids = ['geoId/06085']
 
   # Print all incoming and outgoing properties from Santa Clara County.
-  utils._print_header('Property Labels for Santa Clara County')
+  print('Property Labels for Santa Clara County')
   in_labels = dc.get_property_labels(dcids)
   out_labels = dc.get_property_labels(dcids, out=False)
   print('> Printing properties for {}'.format(dcids))
   print('> Incoming properties: {}'.format(in_labels))
   print('> Outgoing properties: {}'.format(out_labels))
 
   # Print all property values for "containedInPlace" for Santa Clara County.
-  utils._print_header(
-      'Property Values for "containedInPlace" of Santa Clara County')
+  print('Property Values for "containedInPlace" of Santa Clara County')
   prop_vals = dc.get_property_values(
       dcids, 'containedInPlace', out=False, value_type='City')
   print('> Cities contained in {}'.format(dcids))
@@ -49,41 +44,13 @@ def main():
       print(' - {}'.format(city_dcid))
 
   # Print the first 10 triples associated with Santa Clara County
-  utils._print_header('Triples for Santa Clara County')
+  print('Triples for Santa Clara County')
   triples = dc.get_triples(dcids)
   for dcid in dcids:
     print('> Triples for {}'.format(dcid))
     for s, p, o in triples[dcid][:5]:
       print(' - ("{}", {}, "{}")'.format(s, p, o))
 
-  # get_property_values can be easily used to populate Pandas DataFrames. First
-  # create a DataFrame with some data.
-  utils._print_header('Initialize the DataFrame')
-  pd_frame = pd.DataFrame({'county': ['geoId/06085', 'geoId/24031']})
-  print(pd_frame)
-
-  # Get the names for the given counties.
-  utils._print_header('Get County Names')
-  pd_frame['county_name'] = pd_frame['county'].map(
-      dc.get_property_values(pd_frame['county'], 'name'))
-  pd_frame = pd_frame.explode('county_name')
-  print(pd_frame)
-
-  # Get the cities contained in these counties.
-  utils._print_header('Get Contained Cities')
-  pd_frame['city'] = pd_frame['county'].map(
-      dc.get_property_values(
-          pd_frame['county'], 'containedInPlace', out=False, value_type='City'))
-  pd_frame = pd_frame.explode('city')
-  print(pd_frame)
-
-  # Get the names for each city.
-  utils._print_header('Get City Names')
-  pd_frame['city_name'] = pd_frame['city'].map(
-      dc.get_property_values(pd_frame['city'], 'name'))
-  pd_frame = pd_frame.explode('city_name')
-  print(pd_frame)
-
 
 if __name__ == '__main__':
   main()
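The pandas walk-through removed from this example is still possible on the caller's side. A hypothetical user-side sketch follows, assuming pandas is installed separately (it is no longer a dependency of the examples) and passing a plain list of dcids to get_property_values, as the remaining examples do.

import datacommons as dc
import pandas as pd

frame = pd.DataFrame({'county': ['geoId/06085', 'geoId/24031']})
# get_property_values returns a dict keyed by dcid with list values,
# so Series.map() fills the column and explode() gives one row per value.
name_map = dc.get_property_values(list(frame['county']), 'name')
frame['county_name'] = frame['county'].map(name_map)
frame = frame.explode('county_name')
print(frame)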

datacommons/examples/places.py

Lines changed: 1 addition & 20 deletions

@@ -21,42 +21,23 @@
 from __future__ import print_function
 
 import datacommons as dc
-import pandas as pd
-
-import datacommons.utils as utils
-
 
 def main():
   # Create a list of dcids for Santa Clara and Montgomery County.
   sc, mc = 'geoId/06085', 'geoId/24031'
   dcids = [sc, mc]
 
   # Get all CensusTracts in these two counties.
-  utils._print_header('Get Census Tracts')
+  print('Get Census Tracts')
   tracts = dc.get_places_in(dcids, 'CensusTract')
   if sc in tracts:
     print('> 10 CensusTracts in Santa Clara County')
     for dcid in tracts[sc][:10]:
       print(' - {}'.format(dcid))
-    print()
   if mc in tracts:
     print('> 10 CensusTracts in Montgomery County')
     for dcid in tracts[mc][:10]:
       print(' - {}'.format(dcid))
 
-  # We perform the same task using a Pandas DataFrame. First, initialize a
-  # DataFrame with Santa Clara and Montgomery County.
-  utils._print_header('Initialize the DataFrame')
-  pd_frame = pd.DataFrame({'county': ['geoId/06085', 'geoId/24031']})
-  print(pd_frame)
-
-  # Get all CensusTracts in these two counties.
-  utils._print_header('Get Census Tracts')
-  pd_frame['tracts'] = pd_frame['county'].map(
-      dc.get_places_in(pd_frame['county'], 'CensusTract'))
-  pd_frame = pd_frame.explode('tracts')
-  print(pd_frame)
-
-
 if __name__ == '__main__':
   main()

datacommons/examples/populations.py

Lines changed: 3 additions & 42 deletions

@@ -21,11 +21,7 @@
 from __future__ import print_function
 
 import datacommons as dc
-import pandas as pd
 import pprint
-
-import datacommons.utils as utils
-
 import json
 
 
@@ -35,16 +31,14 @@ def main():
   dcids = [ca, ky, md]
 
   # Get the population of all employed individuals in the above states.
-  utils._print_header('Get Populations for All Employed Individuals')
+  print('Get Populations for All Employed Individuals')
   employed = dc.get_populations(dcids, 'Person', constraining_properties={
     'employment': 'BLS_Employed'})
-  print('> Printing all populations of employed individuals\n')
   print(json.dumps(employed, indent=2))
 
   # Get the count for all male / females for the above states in 2016
-  utils._print_header('Get Population Counts for Employed Individuals in Maryland')
+  print('Get Population Counts for Employed Individuals in Maryland')
   pop_dcids = [employed[md]]
-  print('> Requesting observations for {} in December 2018\n'.format(pop_dcids))
   obs = dc.get_observations(pop_dcids,
                             'count',
                             'measuredValue',
@@ -53,41 +47,8 @@ def main():
                             measurement_method='BLSSeasonallyAdjusted')
   print(json.dumps(obs, indent=2))
 
-  # We perform the same workflow using a Pandas DataFrame. First, initialize a
-  # DataFrame with Santa Clara and Montgomery County.
-  utils._print_header('Initialize the DataFrame')
-  pd_frame = pd.DataFrame({'state': ['geoId/06', 'geoId/21', 'geoId/24']})
-  pd_frame['state_name'] = pd_frame['state'].map(
-      dc.get_property_values(pd_frame['state'], 'name'))
-  pd_frame = pd_frame.explode('state_name').reset_index(drop=True)
-
-  # Get populations for employed individuals
-  utils._print_header('Add Population and Observation to DataFrame')
-  pd_frame['employed_pop'] = pd_frame['state'].map(dc.get_populations(
-      pd_frame['state'],
-      'Person',
-      constraining_properties={'employment': 'BLS_Employed'}))
-
-  # Add the observation for employed individuals
-  pd_frame['employed_count'] = pd_frame['employed_pop'].map(
-      dc.get_observations(
-          pd_frame['employed_pop'],
-          'count',
-          'measuredValue',
-          '2018-12',
-          observation_period='P1M',
-          measurement_method='BLSSeasonallyAdjusted'))
-  print(pd_frame)
-
-  # Final dataframe. Use the convenience function "clean_frame" to convert
-  # columns to numerical types.
-  utils._print_header('Final Data Frame')
-  pd_frame = pd_frame.dropna().reset_index(drop=True)
-  print(pd_frame)
-
-
   # Get all population and observation data of Mountain View.
-  utils._print_header('Get Mountain View population and observation')
+  print('Get Mountain View population and observation')
   popobs = dc.get_pop_obs("geoId/0649670")
   pprint.pprint(popobs)

datacommons/places.py

Lines changed: 0 additions & 2 deletions

@@ -24,8 +24,6 @@
 
 import datacommons.utils as utils
 
-import requests
-
 
 def get_places_in(dcids, place_type):
   """ Returns :obj:`Place`s contained in :code:`dcids` of type

datacommons/populations.py

Lines changed: 18 additions & 3 deletions

@@ -28,7 +28,22 @@
 
 import datacommons.utils as utils
 
-import requests
+
+def _flatten_results(result, default_value=None):
+  """ Formats results to map to a single value or default value if empty. """
+  for k in list(result):
+    v = result[k]
+    if len(v) > 1:
+      raise ValueError(
+        'Expected one result, but more returned for "{}": {}'.format(k, v))
+    if len(v) == 1:
+      result[k] = v[0]
+    else:
+      if default_value is not None:
+        result[k] = default_value
+      else:
+        del result[k]
+  return result
 
 
 def get_populations(dcids, population_type, constraining_properties={}):
@@ -96,7 +111,7 @@ def get_populations(dcids, population_type, constraining_properties={}):
     payload, 'population', must_exist=dcids)
 
   # Drop empty results while flattening
-  return utils._flatten_results(result)
+  return _flatten_results(result)
 
 
 def get_observations(dcids,
@@ -184,7 +199,7 @@ def get_observations(dcids,
   # Drop empty results by calling _flatten_results without default_value, then
   # coerce the type to float if possible.
   typed_results = {}
-  for k, v in utils._flatten_results(result).items():
+  for k, v in _flatten_results(result).items():
    try:
      typed_results[k] = float(v)
    except ValueError:
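For reference, the new private helper collapses each list of values to a single value, drops empty entries when no default is given, and raises if a key maps to more than one value. A small illustration with made-up dcids and population ids, assuming the helper is imported from datacommons.populations:

from datacommons.populations import _flatten_results

result = {
    'geoId/06': ['dc/p/abc123'],  # one value: flattened to that value
    'geoId/21': [],               # empty: dropped when default_value is None
}
print(_flatten_results(dict(result)))
# {'geoId/06': 'dc/p/abc123'}
print(_flatten_results(dict(result), default_value=''))
# {'geoId/06': 'dc/p/abc123', 'geoId/21': ''}
# A key mapping to more than one value raises ValueError.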

datacommons/query.py

Lines changed: 20 additions & 10 deletions

@@ -23,8 +23,9 @@
 
 from datacommons.utils import _API_ROOT, _API_ENDPOINTS, _ENV_VAR_API_KEY
 
+import json
 import os
-import requests
+import urllib.request
 
 # ----------------------------- WRAPPER FUNCTIONS -----------------------------
 
@@ -88,17 +89,26 @@ def query(query_string, select=None):
   if not os.environ.get(_ENV_VAR_API_KEY, None):
     raise ValueError(
       'Request error: Must set an API key before using the API!')
-  url = _API_ROOT + _API_ENDPOINTS['query']
-  res = requests.post(url, json={'sparql': query_string}, headers={
-    'x-api-key': os.environ[_ENV_VAR_API_KEY]
-  })
-
-  # Verify then store the results.
-  if res.status_code != 200:
+  req_url = _API_ROOT + _API_ENDPOINTS['query']
+
+  headers = {
+    'x-api-key': os.environ[_ENV_VAR_API_KEY],
+    'Content-Type': 'application/json'
+  }
+  req = urllib.request.Request(
+    req_url,
+    data=json.dumps({'sparql': query_string}).encode("utf-8"),
+    headers=headers)
+
+  try:
+    res = urllib.request.urlopen(req)
+  except urllib.error.HTTPError as e:
     raise ValueError(
       'Response error: An HTTP {} code was returned by the mixer. Printing '
-      'response\n\n{}'.format(res.status_code , res.text))
-  res_json = res.json()
+      'response\n\n{}'.format(e.code, e.read()))
+
+  # Verify then store the results.
+  res_json = json.loads(res.read())
 
   # Iterate through the query results
   header = res_json['header']
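One consumer-visible point about this switch: with urllib, a non-2xx response raises urllib.error.HTTPError inside query(), which the function converts to a ValueError just as the old status-code check did, so existing caller-side error handling keeps working. A hedged usage sketch follows; it assumes query() is re-exported at the package level (as the examples' Bazel targets suggest), and the SPARQL text is illustrative only.

import datacommons as dc

try:
  # A missing API key or an error response from the mixer both surface as
  # ValueError, with urllib just as with requests.
  rows = dc.query('SELECT ?name WHERE { ?state typeOf State . ?state name ?name }')
  print(rows)
except ValueError as err:
  print('Query failed: {}'.format(err))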
