-
Notifications
You must be signed in to change notification settings - Fork 0
/
wrangling.py
139 lines (116 loc) · 5.76 KB
/
wrangling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment
import datetime
import re
def scrape_page(url):
r = requests.get(url)
time_retrieved = datetime.datetime.now().strftime(("%Y-%m-%d %H:%M:%S"))
return BeautifulSoup(r.content, "lxml"), time_retrieved
def remove_comments(soup):
comments = soup.find_all(string=lambda x: isinstance(x, Comment))
for comment in comments:
comment.extract()
def is_relevant_listing(tag):
# find names of tag parents by mapping
if tag.name == "app-housing-list-item":
parentsNames = list(map((lambda x: x.name), tag.parents))
is_not_public = tag.find("app-listing-information-hidden")
return not ("ngb-carousel" in parentsNames) and not ("swiper" in parentsNames) \
and not is_not_public
else:
return False
def append_to_dictlist(tag_list, dict_list, time_retrieved, status):
for tag in tag_list:
# structuring for code readability
top_info = tag.find("app-listing-information-lg").div.div
middle_info = tag.find("app-listing-information-lg").find("app-property-label").parent
bottom_info = tag.find(class_="house-details-blocks")
link = tag.a['href']
address=top_info.div.div.span.string
city=address.parent.next_sibling.string
price=top_info.div.next_sibling.div.contents[1]
price_per_m2=price.parent.next_sibling.string
price_diff_tag=top_info.div.next_sibling.div.find("span", class_="badge")
price_diff = price_diff_tag.string if price_diff_tag else "0"
housing_type = middle_info.find("app-property-label").find("span", class_="text").string
date_added = middle_info.p.string
rooms = bottom_info.contents[0].span.string
area = bottom_info.contents[1].span.string
energy_label = bottom_info.contents[2].span.string
year_built = bottom_info.contents[3].span.string
ground_area = bottom_info.contents[4].span.string
monthly_cost = bottom_info.contents[5].span.string
date_removed = tag.find("app-listing-information-lg").find("span", class_="bg-warning").string if status == "removed" else ""
# appending data dictionnary to dict_list. BeautifulSoup converts HTML non-breaking spaces ( ) to \xa0 (unicode), so we need to remove them.
dict_list.append({
"link": link,
"address": str(address).replace(u'\xa0', u' '),
"city": city,
"housing_type": housing_type,
"price": str(price).replace(u'\xa0', u' '),
"price_per_m2": str(price_per_m2).replace(u'\xa0', u' '),
"price_diff%": price_diff,
"area": area,
"rooms": rooms,
"ground_area": ground_area,
"energy_label": energy_label,
"year_built": year_built,
"monthly_cost": monthly_cost,
"date_added": date_added,
"retrieved": time_retrieved,
"date_removed": date_removed,
"status": status
})
def create_dataframe(dict_list):
df = pd.DataFrame(dict_list)
df["id"] = df.link.str.split("/").str[2]
df = df.apply(lambda col: col.str.strip())
df.address = df.address.str.rstrip(",")
df["postcode"] = df.city.str.split(n=1, expand=True)[0]
df.city = df.city.str.split(n=1, expand=True)[1]
df.energy_label = df.energy_label.str.replace("Energimærke: ", "")
df.price = df.price.str.replace(" kr\.","").str.replace("\.","")
df.price_per_m2 = df.price_per_m2.str.replace(" kr\. \/ m²","")\
.str.replace("\.","")
df["price_diff%"] = df["price_diff%"].str.replace("%","")
df.area = df.area.str.replace(" m²","")
df.monthly_cost = df.monthly_cost.str.replace(" kr\. \/ md\.","")\
.str.replace("Ejerudgift: ","")\
.str.replace("Boligydelse: ","")\
.str.replace("\.","")
df.rooms = df.rooms.str.replace("Værelser: ","").str.replace(" værelse","")
df.ground_area = df.ground_area.str.replace(" m²","").str.replace("\.","")
numeric_columns = ["id", "price","price_per_m2", "area","rooms",
"ground_area", "monthly_cost"]
df[numeric_columns] = df[numeric_columns]\
.apply(lambda col: pd.to_numeric(col, errors="coerce").astype(int))
# to replace the non-numbers/missing info with NaN
df["price_diff%"] = pd.to_numeric(df["price_diff%"], errors="coerce").fillna(0).astype(int)
df["price_diff%"] = df["price_diff%"]/100
# to replace the non-numbers/missing info with 0
def convert_to_date_string(str):
arr = str.replace("Oprettet ","").replace(".","").split(" ")
if (len(arr[0])==1):
arr[0] = "0" + arr[0] # adding a 0 to numbers from 1 to 9
months_conversion = {"jan": "01", "feb": "02", "mar": "03", "apr": "04",
"maj": "05", "jun": "06", "jul": "07", "aug": "08", "sep":"09", "okt": "10",
"nov": "11", "dec": "12"}
date = arr[2]+ "-" + months_conversion[arr[1]] + "-" + arr[0]
return date
df.date_added = df.date_added.apply(convert_to_date_string)
df.date_added = df.date_added.astype("datetime64[ns]")
df.retrieved = df.retrieved.astype("datetime64[ns]")
df.date_removed = df.date_removed\
.str.replace("Ikke længere til salg - sidst set ", "")
df.date_removed = pd.to_datetime(df.date_removed, format="%d-%m-%Y", errors="coerce")
df.loc[df.status=="online", "days_on_sale"] = (df.retrieved-df.date_added).dt.days
df.loc[df.status=="removed", "days_on_sale"] = (df.date_removed-df.date_added).dt.days
df.year_built = pd.to_datetime(df.year_built, format="%Y", errors="coerce")
df = df[["id", "address", "postcode", "city", "housing_type", "price", "price_per_m2", \
"price_diff%", "area", "ground_area", "rooms", "year_built", "energy_label", \
"monthly_cost", "date_added", "date_removed", "days_on_sale", "status", \
"retrieved", "link"]]
return df