# pararius_scrape.py
import re
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup
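# requests, pandas and beautifulsoup4 are third-party packages; if they are not
# installed yet, `pip install requests pandas beautifulsoup4` pulls them in.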


def scrape_data():
    city = input("Enter the city you want to scrape: ").lower()
    min_price = int(input("Enter the min price you want to pay: "))
    max_price = int(input("Enter the max price you want to pay: "))
    page = 1
    records = []
    while True:
        if page == 1:
            url = f"https://www.pararius.com/apartments/{city}/{min_price}-{max_price}"
        else:
            url = f"https://www.pararius.com/apartments/{city}/{min_price}-{max_price}/page-{page}"
        r = requests.get(url)
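        # Note: requests identifies itself with a python-requests User-Agent, which
        # some sites block; if responses come back empty or with an error status,
        # sending a browser-like header may help, e.g.
        # requests.get(url, headers={"User-Agent": "Mozilla/5.0"})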
        soup = BeautifulSoup(r.text, "html.parser")
        results = soup.find_all("li", attrs={"class": "property-list-item-container "})
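        # The trailing space in the class name looks odd but is kept as-is:
        # BeautifulSoup also matches the class attribute as one full string, and the
        # site's markup presumably contained exactly this value when this was written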
print(f"page {page} loading...")
        for result in results:
            # Gets the type of listing (apartment, studio, house)
            type_house = result.find("span", attrs={"class": "type"}).text
            # Gets the name and address of the listing
            house_title = result.find("h2").find("a").text.split()[1:]
            house_title = " ".join(house_title)
            # Gets the location data (postal code, neighbourhood)
            location = (
                result.find("ul", attrs={"class": "breadcrumbs"})
                .find_all("li")[0]
                .text.split()
            )
            postal_code = " ".join(location[:2])
            part_of_city = location[-1]
            # Gets the number of bedrooms
            bedrooms = result.find("li", attrs={"class": "bedrooms"}).text.split()[0]
            # Gets the surface area of the house in m2
            surface = int(result.find("li", attrs={"class": "surface"}).text.split()[0])
            # Gets whether the house is furnished or not
            furniture = result.find("li", attrs={"class": "furniture"}).text.split()[0]
            # Gets the URL of the house
            link = result.find("a")["href"]
            # Gets the description of the house
            description = result.find("p", attrs={"class": "description"}).text
            # Gets the estate agent of the house
            estate_agent = (
                result.find("p", attrs={"class": "estate-agent"}).find("a").text
            )
            # Gets the rent of the house and whether it is inclusive or exclusive
            rent = result.find("p", attrs={"class": "price"}).text.split()[0]
            rent_regex = float(re.sub(r"\D", "", rent))
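            # re.sub(r"\D", "", ...) strips every non-digit, so a price token such as
            # "€1.500" (an assumed example) becomes 1500.0 after the float() call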
            inclusive = result.find("p", attrs={"class": "price"}).text.split()[2][1:-1]
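            # [2] takes the third whitespace-separated token of the price text and
            # [1:-1] strips its surrounding brackets, e.g. "(incl.)" -> "incl.";
            # this assumes a price string shaped like "€1.500 /month (incl.)"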
            # Gets the available-from and offered-since dates from the detail page
            stored_data = []
            detail_list = []
            r2 = requests.get(f"https://www.pararius.com{link}")
            soup2 = BeautifulSoup(r2.text, "html.parser")
            details = soup2.find_all("dd")
            for detail in details:
                stored_data.append(detail)
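            # The dates are picked out by position: the available-from value is assumed
            # to be the third-to-last <dd> on the detail page and offered-since the
            # last one, so a change in the page layout would silently break this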
            # Available-from date
            available = str(stored_data[-3])
            available_regex = re.sub(r"<.*?>", "", available)
            detail_list.append(available_regex)
            # Offered-since date
            offered = str(stored_data[-1])
            offered_regex = re.sub(r"<.*?>", "", offered)
            detail_list.append(offered_regex)
            # Adds all the items to records as one tuple
            records.append(
                (
                    house_title,
                    type_house,
                    part_of_city,
                    postal_code,
                    bedrooms,
                    rent_regex,
                    inclusive,
                    surface,
                    furniture,
                    detail_list[0],
                    detail_list[1],
                    description,
                    estate_agent,
                    link,
                )
            )
        # Goes to the next page if possible, otherwise breaks out of the loop
        if soup.find("li", attrs={"class": "next"}) is None:
            break
        print(f"page {page} completed!")
        # Delay added to avoid overloading the server
        time.sleep(5)
        page += 1
    print(f"page {page} completed!")
    print("Extracted all data.")
    return records


def make_csv(data):
    # Creates a dataframe with pandas
    df = pd.DataFrame(
        data,
        columns=[
            "Title",
            "Type",
            "Location",
            "Postal Code",
            "Bedrooms",
            "Rent",
            "Inclusive",
            "Surface (m2)",
            "Furnished",
            "Available from",
            "Offered since",
            "Description",
            "Agent",
            "Link",
        ],
    )
    # Creates a CSV file from the dataframe
    df.to_csv("house_data.csv", index=False)


if __name__ == "__main__":
    d = scrape_data()
    make_csv(d)
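
# Typical run (example values; output abbreviated):
#   $ python pararius_scrape.py
#   Enter the city you want to scrape: amsterdam
#   Enter the min price you want to pay: 1000
#   Enter the max price you want to pay: 2000
#   page 1 loading...
#   page 1 completed!
#   ...
#   Extracted all data.
# The results end up in house_data.csv in the current working directory.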