-
Notifications
You must be signed in to change notification settings - Fork 115
/
get_flags.py
102 lines (81 loc) · 3.38 KB
/
get_flags.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import re
import os
import codecs
import json
import requests
from bs4 import BeautifulSoup
# Base URL for all Wikipedia requests; page hrefs scraped below are site-relative.
WIKI_URL = 'http://en.wikipedia.org'
# Directory containing this script; all output files are written next to it.
_here = os.path.dirname(__file__)
# Error messages accumulated across the run, summarized at the end of main().
errors = []
# Country records filled in by get_flag_page() and dumped by write_json().
countries = []
def main():
    """Scrape the ISO 3166-1 table and fetch each country's flag and license.

    Writes the licenses.csv header (rows are appended per country by
    append_licenses), downloads every flag SVG, and finally dumps the
    collected records to countries.json.
    """
    # Start the CSV fresh with a header row. Use codecs/utf-8 to match
    # append_licenses(), which appends to this same file — the original used
    # plain open() and thus the platform default encoding.
    with codecs.open(os.path.join(_here, 'licenses.csv'), 'w', 'utf-8') as f:
        f.write('Alpha-3 code,English short name,License\n')
    r = requests.get(f'{WIKI_URL}/wiki/ISO_3166-1')
    soup = BeautifulSoup(r.text, 'html.parser')
    # The second table on the page lists officially assigned codes; [1:] skips
    # the header row.
    country_rows = soup.select('#mw-content-text table:nth-of-type(2) tr')[1:]
    print(f'Found {len(country_rows)} countries')
    for row in country_rows:
        get_flag_page(dict(
            url=row.select_one('td:nth-of-type(1) a')['href'],
            alpha3=row.select_one('td:nth-of-type(3)').get_text(),
            # Strip footnote markers like "[a]" and replace commas so the
            # name is safe inside the unquoted CSV.
            name=re.sub(r'\[\w+]', '', row.select_one('td:nth-of-type(1)').get_text()).strip().replace(',', ' -'),
        ))
    write_json(countries)
    print(f'Done with {len(errors)} errors')
def get_flag_page(country):
    """Locate and record the flag for one country.

    Fetches the country's Wikipedia page, follows the "Flag of ..." media
    link to the file page, then fills in country['file_url'] and
    country['license'], downloads the SVG, appends a license CSV row, and
    stores the record in the module-level `countries` list.

    Returns True on success; on failure logs to `errors` and returns False.
    """
    r = requests.get(WIKI_URL + country['url'])
    soup = BeautifulSoup(r.text, 'html.parser')
    # The flag thumbnail links to a media page whose title starts "Flag of".
    media_link = soup.find(title=re.compile('^Flag of'))
    if media_link is None:
        errors.append(f'No flag found for "{country["name"]}"')
        print(errors[-1])
        return False
    media_url = media_link['href']
    r = requests.get(WIKI_URL + media_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    file_link = soup.select('#file > a')
    if not file_link:
        # Distinct message from the one above: the media page exists but
        # carries no direct file link.
        errors.append(f'No flag file link found for "{country["name"]}"')
        print(errors[-1])
        return False
    country['file_url'] = file_link[0]['href']
    country['license'] = get_license(soup)
    download_flag(country)
    append_licenses(country)
    countries.append(country)
    return True
def get_license(page):
    """Extract the flag image's license from its parsed Wikipedia file page.

    Looks for known license phrases in the shared image description,
    falling back to the local license box.

    Returns 'Public domain', 'Non-protected works', a
    'Creative Commons Attribution-Share Alike <version> <type>' string, or
    'Unknown' when nothing matches. (Previously an unmatched page returned
    None, which crashed the ','.join in append_licenses, and an empty
    fallback selector raised IndexError.)
    """
    img_desc = page.select('#shared-image-desc')
    if not img_desc:
        img_desc = page.select('#mw-content-text .imbox-license')
    if not img_desc:
        # No licensing section found at all — avoid indexing an empty list.
        return 'Unknown'
    public_domain = re.compile(r'(?i)public domain')
    cc = re.compile(r'(?i)attribution-share alike (?P<version>\d+\.\d+) (?P<type>\w+)')
    non_protected = re.compile(r'(?i)non-protected works')
    if img_desc[0].find(text=public_domain):
        return 'Public domain'
    if len(img_desc) > 1 and img_desc[1].find(text=public_domain):
        return 'Public domain'
    if img_desc[0].find(text=non_protected):
        return 'Non-protected works'
    if text := img_desc[0].find(text=cc):
        match = re.search(cc, text.get_text())
        return f'Creative Commons Attribution-Share Alike {match.group("version")} {match.group("type")}'
    return 'Unknown'
def append_licenses(country):
    """Append one CSV row (alpha3, name, license) to licenses.csv.

    The file is opened in append mode; main() writes the header first.
    """
    # `or 'Unknown'` also covers a *stored* None license — dict.get's default
    # only applies when the key is missing, so get('license', 'Unknown') could
    # still hand None to ','.join and raise TypeError.
    license_text = country.get('license') or 'Unknown'
    with codecs.open(os.path.join(_here, 'licenses.csv'), 'a', 'utf-8') as f:
        f.write(','.join(
            [country['alpha3'], country['name'], license_text]
        ) + '\n')
def download_flag(country):
    """Download the country's flag SVG into images/svg/<alpha3>.svg."""
    # basename() guards against any path separators sneaking into alpha3.
    file_name = os.path.basename(f'{country["alpha3"].lower()}.svg')
    path = os.path.join(_here, 'images', 'svg', file_name)
    # file_url is protocol-relative ("//..."), hence the explicit scheme.
    r = requests.get('http:' + country['file_url'])
    print(f'Saving file: "{file_name}" for {country["name"]}')
    with open(path, 'wb') as f:
        # iter_content() without chunk_size yields ONE byte per iteration
        # (requests default chunk_size=1) — use a real buffer size.
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)
def write_json(countries):
    """Serialize the collected country records to countries.json."""
    out_path = os.path.join(_here, 'countries.json')
    with codecs.open(out_path, 'w', 'utf-8') as f:
        # Stream straight into the file instead of building the string first.
        json.dump(countries, f, indent=2)
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()