-
Notifications
You must be signed in to change notification settings - Fork 0
/
aiEgy.py
108 lines (91 loc) · 5.35 KB
/
aiEgy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#! python3
import requests,bs4,openpyxl,re
from openpyxl.utils import get_column_letter
from openpyxl.styles import PatternFill,Font
def writeExcel(names,data):
## write the resulted data in a new excelsheet and save it with name driven_AI3.xlsx
## input: names --a list of names of the companies
## data -- all the data of the companies in the form of dict that includes
## name of companies, thier fields of interest, website link,
## link to facebook page, link to twitter account, link to linkedin account,
## email/s of the company has, thier phone numbers, and thier address in Egypt
wb = openpyxl.Workbook()
sheet=wb.active
columns= ["Name","Interests","Link","Facebook","Twitter","Linkedin",
"Email","Phone","Address"]
for i in range(len(columns)):
# add header column with the names of the columns and format it
sheet.cell(row=1,column=i+1).value = columns[i]
sheet.cell(row=1,column=i+1).fill = PatternFill(bgColor="000000FF", fill_type = "solid")
sheet.cell(row=1,column=i+1).font = Font(color="00FFFFFF")
dims={"Name":[],"Interests":[],"Link":[],"Facebook":[],
"Twitter":[],"Linkedin":[],"Email":[],"Phone":[],
"Address":[]}
for i in range(len(data)):
for j in range(len(columns)):
# write the data dict to the excel sheet
sheet.cell(row=i+2,column=j+1).value ='\n'.join(data[names[i]][columns[j]]) #in case that a company has two results in certain column, joins them with newline
dims[columns[j]].append(len('\n'.join(data[names[i]][columns[j]]))) #store the length of the string in order to be used in modifying the width of columns
for i in range(len(columns)):
#Modify the width of the column using the maximum length of string in each column
sheet.column_dimensions[get_column_letter(i+1)].width = max(dims[columns[i]])
sheet.freeze_panes=sheet['B2']
wb.save('driven_AI3.xlsx')
def getInfo(link):
## Get all the data of a company through it's website
## input: link -- the company's website link address
## output: a dictionary includes:
## link to facebook page, link to twitter account, link to linkedin account,
## email/s of the company has, thier phone numbers, and thier address in Egypt
address_list=[]
phone_list=[]
email_list=[]
facebook_links=[]
twitter_links=[]
linkedin_links=[]
linkOptions = ['contact-us','contact','about-us','about','']
for option in linkOptions:
# Try every page that can contains the contact info from 5 possible page links
try:
res= requests.get(link+option)
res.raise_for_status()
except:continue
soup = bs4.BeautifulSoup(res.text,'lxml')
addressReg = re.compile(r'[^<>{}:\"], Cairo\W?$|[^<>{}:\"], Egypt\W?$',re.IGNORECASE)
phoneReg = re.compile(r'\+\W?2\W?\d?\d?\W?\d{4}\W?\d{4}|\+\W?02\W?\d?\W?\d{4}\W?\d{4}')
emailReg=re.compile(r'[^:\"<>/\s,][A-Za-z0-9._-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}$',re.MULTILINE)
address_list = [*{e.strip('\n<> ').split(":")[-1] for e in soup(text=addressReg)}] #locate the address , ex--> Address: Maadi Technology Park, 11435 Cairo
phone_list = [*{e.strip('\n<> ').split('+')[-1] for e in soup(text=phoneReg)}] #locate the phone number , ex--> +2 02 23456789
email_list = [*{e.strip('\n<> ') for e in soup(text=emailReg)}] #locate the email, ex--> <contact@co.com>
facebook_links = [*{x.get('href').lower().split("//")[1] for x in soup.select('a')\
if "facebook.com" in x.get('href',[])}] #get the Facebook link from all the links in contact page and remove "http://"
twitter_links = [*{x.get('href').lower().split("//")[1] for x in soup.select('a')\
if "twitter.com" in x.get('href',[])}] #get the Twitter link from all the links in contact page and remove "http://"
linkedin_links = [*{x.get('href').lower().split("//")[1] for x in soup.select('a')\
if "linkedin.com" in x.get('href',[])}] #get the Linkedin link from all the links in contact page and remove "http://"
if address_list:
# stop in case the crawler already found the data
break
return({"Address":address_list,"Phone":phone_list,"Email":email_list,\
"Facebook":facebook_links,"Twitter":twitter_links,"Linkedin":linkedin_links})
if __name__ =="__main__":
res = requests.get("https://my-interviews-experience-in-egypt.quora.com/AI-Driven-Companies-in-Egypt")
res.raise_for_status()
soup=bs4.BeautifulSoup(res.text,'lxml')
pageElem = soup.select('ol')
companyElem = pageElem[0].select('li')
names = []
data=dict()
for company in companyElem:
## get the data for every company, and store it in dict with its name as a key
name= company.text.split(':')[0]
website=company.select('a')[0].get('href')
company_interests = company.text.split(':')[-1]
interest_list =company_interests.split('=')[0] #get rid of "======" in the end of each company section
temp=getInfo(website)
temp['Name']=name
temp['Link']=website
temp['Interests']=interest_list
data[name]=temp
names.append(name)
writeExcel(names,data)