-
Notifications
You must be signed in to change notification settings - Fork 0
/
billboard_scraper.py
424 lines (360 loc) · 16.2 KB
/
billboard_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
import requests
from bs4 import BeautifulSoup
import hashlib
import shutil
import os
import datetime
import time
from sys import exit
import traceback
from config.scraper_config import *
def date_string_to_numeric(string_week):
    """
    Convert a long-form date such as 'September 14, 1981' into ISO form ('1981-09-14').
    :param string_week: date written as '<Month name> <day>, <year>'
    :return: the same date as a 'YYYY-MM-DD' string
    """
    parsed = datetime.datetime.strptime(string_week, '%B %d, %Y')
    return parsed.strftime('%Y-%m-%d')
def hash_img_name(url):
    """
    Derive a deterministic file name for an image from its url.
    @param url: image web address
    @return: the SHA-1 hex digest of the utf8-encoded url followed by '.jpg'
    """
    digest = hashlib.sha1(url.encode('utf8'))
    return f'{digest.hexdigest()}.jpg'
def int_if_int(i):
    """
    Convert a value to int when possible; otherwise return None.
    @param i: a value of any type (int, float, string, None, ...)
    @return: int(i) if the conversion succeeds, None if it does not
    """
    try:
        return int(i)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text such as '-' or 'abc' (also float NaN)
        # TypeError: values int() does not accept at all, e.g. None
        # OverflowError: float infinity
        return None
def download_image(url, destination):
    """
    Downloads an image and saves it to the destination folder. The file name is the original url's hash
    (see hash_img_name).
    @param url: image source url; an empty or None url aborts the download
    @param destination: folder to save the image to (created if it does not exist)
    @return: 0 if the image was saved, None if there was no url or the download failed
    """
    if not url:
        print('no image url, aborting download')
        return None

    try:
        os.makedirs(destination, exist_ok=True)
    except OSError as e:
        # Reported but not fatal: the open() below will fail and be reported too.
        print('could not create image directory! ', e)

    try:
        print('requesting image... ', end='')
        # A timeout keeps one stalled connection from hanging the whole scrape.
        with requests.get(url, stream=True, timeout=30) as img:
            print(img)
            # Do not save an HTTP error page under a .jpg name.
            img.raise_for_status()
            # Let urllib3 undo gzip/deflate transfer encoding while streaming the raw body.
            img.raw.decode_content = True
            local_filename = hash_img_name(url)  # creates a unique filename
            with open(os.path.join(destination, local_filename), 'wb') as f:
                shutil.copyfileobj(img.raw, f)
        return 0
    except Exception as e:
        # Deliberately broad: the download is best-effort, and reading img.raw can raise
        # urllib3 errors that are outside the requests exception hierarchy.
        # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.
        print('something went wrong', e)
        return None
class Chart:
    """A single weekly chart: a week label plus a list of dicts, one dict per chart row (song)."""

    def __init__(self, week, chart):
        self.chart, self.week = chart, week

    def __getitem__(self, item):
        """
        Look up one row of the chart.
        :param item: index into the underlying chart list
        :return: the dict stored at that index
        """
        rows = self.chart
        return rows[item]

    def __setitem__(self, key, value):
        """
        Replace one row of the chart.
        :param key: index into the underlying chart list
        :param value: the row dict to store at that index
        :return: None
        """
        rows = self.chart
        rows[key] = value

    def set_chart(self, week, chart):
        """
        Replace both the week and the full list of rows.
        :param week: week of the Chart
        :param chart: list of dicts, each dict is a row item data (song)
        :return: None
        """
        self.chart, self.week = chart, week

    def get_chart(self):
        """
        Return the full list of rows.
        :return: the chart list held by this object
        """
        rows = self.chart
        return rows

    def get_week(self):
        """
        Return the week of this chart.
        :return: the week held by this object
        """
        label = self.week
        return label
def generate_week(start_week, end_week):
    """
    Yields dates going backwards from end_week in one-week steps, never going earlier than start_week.
    Both bounds are inclusive: end_week is always the first value yielded (provided it is not earlier than
    start_week), and start_week itself is yielded when it lies a whole number of weeks before end_week.
    Nothing is yielded when end_week is earlier than start_week.
    @param start_week: the earliest date that may be yielded, as 'YYYY-MM-DD'
    @param end_week: the date to start from and walk backwards, as 'YYYY-MM-DD'
    @return: a generator of 'YYYY-MM-DD' strings
    """
    date_format = '%Y-%m-%d'
    # Parse the lower bound once instead of on every loop iteration.
    earliest = datetime.datetime.strptime(start_week, date_format)
    current_week = datetime.datetime.strptime(end_week, date_format)
    one_week = datetime.timedelta(weeks=1)
    # Yield first, then step back: the bound is checked on the value that is about to be
    # yielded, so no date earlier than start_week ever escapes.
    while current_week >= earliest:
        yield current_week.strftime(date_format)
        current_week -= one_week
class Scraper:
    """A scraper object that handles scraping from billboard.com"""

    def __init__(self, start_week=FIRST_WEEK_EVER, end_week=MOST_RECENT_WEEK, auto_most_recent=False, query=None):
        """
        An initializer for Scraper class
        :param start_week: the week to scrape from (default value from scraper_config if nothing provided)
        :param end_week: the week to scrape to (default value from scraper_config if nothing provided)
        :param auto_most_recent: if True, the most recent week is read from the chart page at BASE_URL and
            overrides end_week.
        :param query: SQL handler object; only its insert(chart_rows, week) method is used. None disables
            database updates.
        """
        self.start_week = start_week
        self.query = query
        if auto_most_recent:
            self.end_week = self.figure_out_most_recent_week()
        else:
            self.end_week = end_week

    def get_page_soup(self, url, max_attempts=5, retry_delay=1, timeout=30):
        """
        Grabs a web page and returns a bs4 object, parsed with the parser named by PARSER.
        The request is retried on network errors and on any non-200 status, waiting between attempts.
        @param url: the web page to grab
        @param max_attempts: how many requests to make before giving up
        @param retry_delay: seconds to wait between attempts
        @param timeout: seconds to wait for the server on each request
        @return: a bs4 object, representation of whatever is in `url`
        @raise requests.exceptions.RequestException: if no attempt returned status 200
        """
        print(f'Fetching page from {url}')
        last_problem = None
        for trial in range(1, max_attempts + 1):
            print(f'requesting page, attempt #{trial} --- ', end='')
            try:
                r = requests.get(url, timeout=timeout)
            except requests.exceptions.RequestException as e:
                last_problem = e
                print(f"something went wrong... ({e})")
            else:
                if r.status_code == 200:
                    print(f"Successful! {r}")
                    break
                last_problem = f'HTTP status {r.status_code}'
                print(f"something went wrong... ({r})")
            if trial < max_attempts:
                # Wait before the next attempt instead of hammering the server in a tight loop.
                time.sleep(retry_delay)
        else:
            # Only reached when the loop ran out of attempts without a break.
            raise requests.exceptions.RequestException(
                f'could not fetch {url} after {max_attempts} attempts: {last_problem}')

        print(f'Parsing using {PARSER}... ', end='')
        soup = BeautifulSoup(r.text, PARSER)
        print('done!')
        return soup

    @staticmethod
    def grab_data(item):
        """
        Collecting each item's data from left to right, as it appears inside the relevant html tag on the site
        @param item: a single bs4 element for one chart row (an element of a soup.find_all() result)
        @return: a dictionary holding the data of a row item from a billboard table, with keys
            rank, delta, song, artist, last_pos, peak, duration, img_url and img_filename.
        @raise AttributeError: if one of the expected <span> elements is missing from the item
        """
        item_data = {
            'rank': int(item.find('span', class_='chart-element__rank__number').text),
            # delta and last_pos may hold non-numeric text; those become None
            'delta': int_if_int(item.find('span', class_='chart-element__information__delta__text text--default').text),
            'song': item.find('span', class_='chart-element__information__song').text,
            'artist': item.find('span', class_='chart-element__information__artist').text,
            'last_pos': int_if_int(
                item.find('span', class_='chart-element__meta text--center color--secondary text--last').text),
            'peak': int(item.find('span', class_='chart-element__meta text--center color--secondary text--peak').text),
            'duration': int(
                item.find('span', class_='chart-element__meta text--center color--secondary text--week').text),
            # The url is cut out of the inline style attribute by fixed offsets.
            # NOTE(review): presumably this strips a "background-image: url('" prefix (23 chars) and a
            # "');" suffix (3 chars) - confirm against the live markup.
            'img_url': item.find('span', class_='chart-element__image flex--no-shrink')['style'][23:-3]
        }
        item_data['img_filename'] = hash_img_name(item_data['img_url'])
        return item_data

    @staticmethod
    def _download_item_image(item_data):
        """Best-effort download of one row's image into IMAGE_DIR; failures are printed, never raised."""
        try:
            print(f'getting image from {item_data["img_url"]} ')
            download_image(item_data['img_url'], IMAGE_DIR)
        except Exception as e:
            print(f'image download failed for {item_data["song"]}, {item_data["artist"]}! ', e)

    def update_database(self, chart):
        """
        updating sql, if sql object was passed to Scraper()
        :param chart: the Chart object whose rows and week are inserted
        :return: None
        :raise Exception: whatever self.query.insert() raised, after it has been reported
        """
        if self.query is None:
            return
        try:
            print('Inserting current week to database')
            self.query.insert(chart.chart, chart.week)
        except Exception as e:
            # Report the week taken from the chart itself; no other week name is in scope here.
            print(f'There was something wrong with inserting week {chart.week} to database:', e)
            traceback.print_exc()
            raise

    def get_first_item(self, soup):
        """
        Grabs a first item from a soup object. Good for quick testing purposes and incremental development.
        @param soup: a soup object or a complete web page
        @return: a dictionary with a data of a first item
        """
        # a first bs4 item
        item = soup.find('li', class_='chart-list__element')
        # a dictionary that holds the data of a first item
        print('collecting an item... ')
        item_data = self.grab_data(item)
        self._download_item_image(item_data)
        print('done!')
        return item_data

    def weekly_chart_from_soup(self, soup):
        """
        Gets the entire weekly chart (all items), downloading each item's image along the way.
        @param soup: a soup object of a page containing a billboard chart
        @return: a Chart object whose week is a 'YYYY-MM-DD' string
        """
        week = self.get_week_string(soup)
        print(f'fetching entire chart for week {week} ...')
        try:
            items = soup.find_all('li', class_='chart-list__element display--flex')
            print('got all items bs4 objects')
        except Exception as e:
            print(f'something went wrong with getting weekly chart from week {week}!', e)
            raise

        all_page_items = []  # storing all the items for a page
        for item in items:
            item_data = self.grab_data(item)
            print(f"got data for {item_data['rank']}) {item_data['song']} - {item_data['artist']}")
            self._download_item_image(item_data)
            print('done!')
            all_page_items.append(item_data)

        return Chart(date_string_to_numeric(week), all_page_items)

    @staticmethod
    def get_week_string(soup):
        """
        Get's the week of a given chart soup-object. returns a string
        @param soup: soup object of a billboard.com chart page
        @return: string representing the week, as written on the page's date-selector button
        """
        print('figuring out the week')
        week = soup.find('button', class_='date-selector__button button--link').text
        # the button text is padded with newlines and spaces on both sides
        week = week.strip('\n').strip(' ').strip('\n')
        return week

    def get_time_range(self, start_week=None, end_week=None):
        """
        Scrapes all weekly charts between `start_week` and `end_week`, going backwards from `end_week`.
        Each week is stored in the database (if a query object was given), printed to screen and appended to
        a text file inside DATA_DIR whose name ends with SONG_DATA_FILE.
        If anything fails while scraping, the error is printed and the process exits with status 1.
        @param start_week: earliest week to scrape; defaults to self.start_week
        @param end_week: latest week to scrape; defaults to self.end_week
        @return: a dictionary of all weekly chart within the chosen time range as {week: chart} key-value pairs.
        """
        if start_week is None:
            start_week = self.start_week
        if end_week is None:
            end_week = self.end_week
        if end_week == MOST_RECENT_WEEK and start_week == FIRST_WEEK_EVER:
            print('Scraper is getting time range, but since no `start_week` and `end_week` arguments passed when '
                  'creating the Scraper object, it will be using default values')
        # report the range that is actually scraped, which may differ from the instance defaults
        print(f'getting charts from {start_week} to {end_week}')
        weeks = generate_week(start_week, end_week)
        now = datetime.datetime.now().strftime("%m%d%Y%H%M%S")
        try:
            print(f'Creating data directory at {DATA_DIR}')
            os.mkdir(DATA_DIR)
        except FileExistsError:
            print('Data directory already exists')

        data_path = os.path.join(DATA_DIR, f'{now}_{start_week}_{end_week}_{SONG_DATA_FILE}')
        print(f'Writing data to {SONG_DATA_FILE}')
        time_range_charts = {}
        # The with-block closes the file on every exit path, including the exit(1) below.
        with open(data_path, 'w') as data_file:
            try:
                for current_week in weeks:
                    print(f'current week: {current_week}')
                    weekly_soup = self.get_page_soup(BASE_URL + current_week)
                    current_week_chart = self.weekly_chart_from_soup(weekly_soup)
                    current_week_list = current_week_chart.get_chart()

                    # appending weekly Chart object to time_range_charts
                    time_range_charts[current_week] = current_week_chart

                    # updating sql, if sql object was passed to Scraper()
                    self.update_database(current_week_chart)

                    # printing current week data to screen
                    print('\t\t\t<-----------*-----------*-----------*-----------*----------->')
                    print(f'\t\t\t\t\t\tData for {current_week}')
                    print('\t\t\t<-----------*-----------*-----------*-----------*----------->')
                    for item in current_week_list:
                        print(item)

                    # writing current week data to text file
                    data_file.write('\n\n\n' + current_week + '\n')
                    data_file.writelines(str(item) + '\n' for item in current_week_list)

                    # TODO: get rid of the next line if possible
                    time.sleep(1)
            except Exception as e:
                print('Something went wrong while running get_time_range():', e)
                traceback.print_exc()
                exit(1)
        return time_range_charts

    def get_all_time(self):
        """
        Hard coded method to get all time charts from the first week ever on billboard.com until most recent week
        (as defined in MOST_RECENT_WEEK in ./config/scraper_config.py)
        :return: dictionary of {week: Chart} pairs from all time, as returned by get_time_range().
        """
        print('Getting all time! This might take a while.')
        try:
            all_time_charts = self.get_time_range(FIRST_WEEK_EVER, MOST_RECENT_WEEK)
        except Exception as e:
            print(f'There was a problem while running get_all_time(): {e}')
            # re-raise the original exception so its type and traceback are not lost
            raise
        return all_time_charts

    def figure_out_most_recent_week(self, url=BASE_URL):
        """
        This function runs if __init__() parameter auto_most_recent is set to True (it is False by default).
        The method retrieves the week shown on the chart page at `url`; __init__() uses it as end_week.
        :param url: chart page to read the week from; defaults to BASE_URL.
        :return: string of the week shown on that page, as 'YYYY-MM-DD'
        """
        website_formatted_date = self.get_week_string(self.get_page_soup(url))
        return date_string_to_numeric(website_formatted_date)

    def get_update_from_time(self, start_week=None):
        """
        Get's the charts from a certain time point until self.end_week (MOST_RECENT_WEEK from scraper_config by
        default, or the week found on the site if auto_most_recent was set to True).
        :param start_week: the week from which the update is starting; defaults to self.start_week.
        :return: dictionary of {week: Chart} pairs, as returned by get_time_range().
        """
        if start_week is None:
            start_week = self.start_week
        return self.get_time_range(start_week, self.end_week)

    def get_specific_week(self, specific_week):
        """
        Get's a Chart of a specific single week and stores it in the database (if a query object was given).
        If anything fails, the error is printed and the process exits with status 1.
        :param specific_week: date string in the form of YYYY-MM-DD.
        :return: Chart object
        """
        try:
            weekly_soup = self.get_page_soup(BASE_URL + specific_week)
            chart = self.weekly_chart_from_soup(weekly_soup)
            self.update_database(chart)
        except Exception as e:
            print(f'There was an error getting specific week ({specific_week}) chart:', e)
            traceback.print_exc()
            exit(1)
        return chart