-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathimmoscout24_scrapper.py
146 lines (124 loc) · 5.13 KB
/
immoscout24_scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Scraper for immobilienscout24
from tools import get_header
from datetime import datetime
import re
import pprint
import requests
from bs4 import BeautifulSoup
#proxies = {'https': '51.75.160.176:9999'} proxies = proxies
def get_page_content_hdr(url, headers, proxies):
'''
Get page content using proxies
'''
#pprint.pprint(headers)
response = requests.get(url, headers = headers, proxies= proxies)
pprint.pprint(response)
content = BeautifulSoup(response.text, 'html5lib')
return content, headers
def get_page_content(url):
"""[Get page content]
Args:
url ([str]): [web url]
Returns:
[type]: [description]
"""
headers = get_header()
#pprint.pprint(headers)
response = requests.get(url, headers = headers)
pprint.pprint(response)
content = BeautifulSoup(response.text, 'html5lib')
return content, headers
def get_new_flats_info(immo24_search_url, immo24_base_url):
"""[Get list of new flats from the search page]
Args:
immo24_search_url ([str]): [search url]
immo24_base_url ([type]): [base search url]
Returns:
[new_flats_url_list]: [urls list on flats]
[blocked]:[if crawler got blocked]
"""
immo24_content, header = get_page_content(immo24_search_url)
warning = 'To regain access, please make sure that cookies and JavaScript \
are enabled before reloading the page'
if warning in immo24_content.text:
# Scraper got blocked -> no need to parse further
print('----------------------BLOCKED')
blocked = True
new_flats_url_list = []
else:
flats = immo24_content.find_all(class_='result-list__listing') # get all flats on 1st page
new_flats_url_list = []
for flat in flats:
flat_id = flat.find(attrs={"data-id": True})['data-id'] # get flat ID
flat_url = immo24_base_url + flat_id # make flat url
new_flats_url_list.append(flat_url)
blocked = False
return new_flats_url_list, blocked, header
def get_flat_full_details(immo_flat_url):
"""[Make a dict with data of 1 flat]
Args:
immo_flat_url ([str]): [Flat specific url]
Returns:
[dict]: [Flat data]
"""
flat_content, header = get_page_content(immo_flat_url)
#pprint.pprint(flat_content)
flat_full_info = {}
flat_full_info['weblink'] = immo_flat_url
print(immo_flat_url)
flat_full_info['source'] = 'immoscout24'
flat_full_info['phone'] = '' # not scraping
flat_full_info['email'] = '' # not scraping
try:
flat_full_info['description'] = flat_content.find(class_= 'criteriagroup').h1.text.strip()
except AttributeError:
flat_full_info ['description'] = ''
print('AttributeError - no descripttion')
try:
flat_full_info['address'] = flat_content.find(class_= 'address-block').text.strip()
except AttributeError:
flat_full_info ['address'] = ''
try:
flat_full_info['price'] = flat_content.find(class_= 'is24qa-kaltmiete-main is24-value font-semibold is24-preis-value').text.strip()
except AttributeError:
flat_full_info['price'] = ''
#flat_full_info['price'] = flat_content.find(class_= 'is24qa-kaltmiete is24-value font-semibold is24-preis-value').text.strip()
try:
flat_full_info['rooms'] = flat_content.find(class_= 'is24qa-zi-main is24-value font-semibold').text.strip()
except AttributeError:
flat_full_info['rooms'] = ''
try:
flat_full_info['Area'] = flat_content.find(class_= 'is24qa-flaeche-main is24-value font-semibold').text.strip()
except AttributeError:
flat_full_info['Area'] = ''
try:
flat_full_info['movinDate'] = flat_content.find(class_= 'is24qa-bezugsfrei-ab grid-item three-fifths').text.strip()
except AttributeError:
flat_full_info['movinDate'] = ''
print('AttributeError - no move in date')
#flat_full_info['onlineSince'] = 'no data' #! IF IT GOES WRONG AGAIN!
try:
flat_full_info['petsAllowed'] = flat_content.find(class_= 'is24qa-haustiere grid-item three-fifths').text.strip()
except AttributeError:
flat_full_info['petsAllowed'] = ''
print('AttributeError - no pets info')
try:
online_since = flat_content.find(class_= 'criteriagroup flex flex--wrap criteria-group--spacing padding-top-s').find('script').text.strip()
for online in online_since.splitlines():
if "exposeOnlineSince" in online:
online_date = re.findall(r'"([^"]*)"', online)
flat_full_info['onlineSince'] = ''.join(online_date).split('.')[0].strip()
break
else:
flat_full_info['onlineSince'] = 'no data'
except Exception as e:
print(e)
flat_full_info['onlineSince'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%S')
#Adding a pic to it
try:
flat_full_info['imageLink'] = flat_content.find(class_= 'first-gallery-picture-container').find('img').get('src')
#print(type(flat_full_info['imageLink']))
except:
print('no picture provided')
flat_full_info['imageLink'] = ''
return flat_full_info