135 lines
3.9 KiB
Python
Executable File
135 lines
3.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import re
|
|
import json
|
|
import uuid
|
|
from time import sleep
|
|
|
|
import selenium
|
|
from selenium import webdriver
|
|
from selenium.webdriver.common.by import By
|
|
|
|
|
|
def gen_uuid():
    """Return a newly generated version-1 UUID rendered as a string."""
    fresh_id = uuid.uuid1()
    return str(fresh_id)
|
|
|
|
def get_address():
    """Placeholder without an implementation yet; always returns ``None``."""
    return None
|
|
|
|
def is_mieter_plus(listing):
    """Tell whether *listing* contains a ``div.plusBooking`` element.

    Args:
        listing: An element exposing selenium's ``find_element`` API.

    Returns:
        ``True`` if the lookup succeeds, ``False`` if selenium raises
        ``NoSuchElementException`` for it.
    """
    try:
        # Only the presence of the element matters; the element is discarded.
        listing.find_element(By.CSS_SELECTOR, 'div.plusBooking')
    except selenium.common.exceptions.NoSuchElementException:
        return False
    else:
        return True
|
|
|
|
def is_exact_address(address):
    """Classify an address string by the number of commas it contains.

    Args:
        address: The address text to inspect.

    Returns:
        ``False`` when *address* contains exactly one comma, ``True`` for any
        other count (including none).
    """
    # NOTE(review): presumably a single comma means only a coarse
    # "district, city" location was published - confirm against real data.
    comma_total = address.count(',')
    return comma_total != 1
|
|
|
|
def to_number(value):
    """Extract the number(s) contained in a display string.

    The string is split on ``-`` so that a range yields one number per part.
    Within each part every character other than a digit or a comma is
    dropped (so ``.`` is discarded), and the comma is then treated as the
    decimal separator.

    Args:
        value: Text such as ``'1.234,56 €'`` or ``'2 - 3'``.

    Returns:
        A list of floats, one per part that could be converted. Parts that
        cannot be converted are reported on stdout and skipped, so the list
        may be empty.
    """
    numbers = []
    for part in value.split('-'):
        # Keep digits and commas only, then turn the decimal comma into a dot.
        cleaned = re.sub(r'[^\d,]', '', part).replace(',', '.')
        try:
            numbers.append(float(cleaned))
        except ValueError as e:
            # float() on a str can only fail with ValueError (empty part or
            # more than one decimal separator); report it and carry on.
            print(f'orig={value!r} eur', e)
    return numbers
|
|
|
|
def format_title(title):
    """Return *title* without a leading ``NEU`` marker.

    Only one occurrence at the very start is removed; titles that do not
    start with the marker are returned unchanged.
    """
    marker = 'NEU'
    if title.startswith(marker):
        return title[len(marker):]
    return title
|
|
|
|
# Maps the label shown for a primary criterion to the key used in the output.
_CRITERION_KEYS = {
    'Warmmiete': 'rent',
    'Wohnfläche': 'area',
    'Zi.': 'rooms',
}


def _scrape_listing(listing):
    """Collect the fields of one result-list entry into a dict."""
    try:
        title_el = listing.find_element(By.CSS_SELECTOR, 'a.result-list-entry__brand-title-container')
        title = format_title(title_el.text)
        link = title_el.get_attribute('href')
    except selenium.common.exceptions.NoSuchElementException:
        # Entries without the title link are still recorded, just untitled.
        title = None
        link = None

    address = listing.find_element(By.CSS_SELECTOR, 'button.result-list-entry__map-link').text
    entry = {
        'uuid': gen_uuid(),
        'title': title,
        'link': link,
        'address': address,
        'exactAddress': is_exact_address(address),
        'mieterPlus': is_mieter_plus(listing),
    }

    criteria = listing.find_elements(By.CSS_SELECTOR, 'dl.result-list-entry__primary-criterion')
    for criterion in criteria:
        name = criterion.find_element(By.CSS_SELECTOR, 'dt').text
        raw = criterion.find_element(By.CSS_SELECTOR, 'dd').text
        key = _CRITERION_KEYS.get(name)
        if key is None:
            # Unrecognised criteria keep their raw text; a later unknown
            # criterion overwrites an earlier one, as only one key is used.
            entry['unknown'] = [raw]
        else:
            entry[key] = to_number(raw)
    return entry


def _append_to_json(data, path='out.json'):
    """Append *data* to the JSON list stored at *path*, creating the file if absent."""
    try:
        with open(path, 'r', encoding='utf-8') as in_data:
            loaded_data = json.load(in_data)
    except FileNotFoundError:
        loaded_data = []
    with open(path, 'w', encoding='utf-8') as out:
        json.dump(loaded_data + data, out, ensure_ascii=False)


def main():
    """Scrape every page of the search result list into ``out.json``.

    Opens the search URL in Firefox, waits for the user to press Enter at the
    ``Start?`` prompt, then walks the result pages. After each page its
    listings are appended to ``out.json`` and the "Next page" link is
    followed until its parent element is marked ``disabled``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If a listing has
            no map-link button or the page has no "Next page" link.
    """
    # NOTE(review): the driver is bound as a module global and never quit;
    # presumably so the browser stays available for inspection - confirm,
    # otherwise call driver.quit() when the loop ends.
    global driver
    url = (
        'https://www.immobilienscout24.de/Suche/radius/wohnung-mieten?'
        'centerofsearchaddress=Hamburg;;;1276006001;Hamburg;&'
        'price=-1000.0&'
        'pricetype=calculatedtotalrent&'
        'geocoordinates=53.55384;9.99165;10.0&'
        'enteredFrom=result_list'
    )

    driver = webdriver.Firefox()
    driver.get(url)

    # Scraping starts only once the user confirms at the prompt.
    input('Start?')

    page = 1
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        listings = driver.find_elements(By.CSS_SELECTOR, 'li.result-list__listing')
        data = [_scrape_listing(listing) for listing in listings]

        # Persist before looking at pagination: checking for the last page
        # first would drop the final page's listings.
        _append_to_json(data)
        print(f'\033[33mScraped page {page}\033[m')

        next_page = driver.find_element(By.CSS_SELECTOR, 'a[aria-label="Next page"]')
        parent_next_page_class = next_page.find_element(By.XPATH, '..').get_attribute('class')
        if 'disabled' in parent_next_page_class:
            print('\033[31mNo next page. Exiting\033[m')
            break

        # NOTE(review): nothing waits for the next page to load after the
        # click; confirm the following lookup cannot race with navigation.
        next_page.click()
        page += 1
|
|
|
|
# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()

    # Completion message wrapped in ANSI colour escape sequences.
    # NOTE(review): assumed to belong inside the guard - confirm.
    print('\033[32mDone\033[m')
|