wohnungssuche/immoscout.py

135 lines
3.9 KiB
Python
Executable File

#!/usr/bin/env python3
import re
import json
import uuid
from time import sleep
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
def gen_uuid():
    """Return a newly generated version-1 UUID as its canonical string."""
    identifier = uuid.uuid1()
    return str(identifier)
def get_address():
    """Placeholder without any behavior yet; always returns ``None``."""
    return None
def is_mieter_plus(listing):
    """Report whether *listing* contains a ``div.plusBooking`` element.

    Args:
        listing: An element exposing Selenium's ``find_element`` API.

    Returns:
        ``True`` if the lookup succeeds, ``False`` if Selenium raises
        ``NoSuchElementException``.
    """
    try:
        listing.find_element(By.CSS_SELECTOR, 'div.plusBooking')
    except selenium.common.exceptions.NoSuchElementException:
        return False
    else:
        return True
def is_exact_address(address):
    """Classify an address string as exact based on its comma count.

    An address with exactly one comma is reported as not exact; every other
    comma count (including zero) is reported as exact.
    NOTE(review): presumably a one-comma address is only "district, city" --
    confirm against the site's address format.
    """
    comma_count = address.count(',')
    return comma_count != 1
def to_number(value):
    """Parse a figure, or a ``-``-separated range of figures, into floats.

    Each ``-``-separated part is stripped of everything except digits and
    commas (so currency signs, units and ``.`` thousands separators vanish),
    the comma is turned into a decimal point, and the result is converted
    with ``float``.

    Args:
        value: Text such as ``'1.234,56 €'`` or ``'50 - 60 m²'``.

    Returns:
        A list with one float per part that could be converted. Parts that
        cannot be converted (e.g. no digits at all) are reported on stdout
        and left out, so the list may be empty.
    """
    numbers = []
    for part in value.split('-'):
        cleaned = re.sub(r'[^\d,]', '', part).replace(',', '.')
        try:
            numbers.append(float(cleaned))
        except ValueError as e:
            # float() on a str can only fail with ValueError; report the
            # complete original input (same text as before) and keep going.
            print(f'orig={value!r} eur', e)
    return numbers
def format_title(title):
    """Return *title* without a leading ``NEU`` marker, if it has one.

    Only an exact, case-sensitive ``NEU`` at the very start is removed;
    nothing else (including following whitespace) is touched.
    """
    marker = 'NEU'
    if title.startswith(marker):
        return title[len(marker):]
    return title
# Maps the label shown in a listing's primary criteria to the output key.
_CRITERION_KEYS = {
    'Warmmiete': 'rent',
    'Wohnfläche': 'area',
    'Zi.': 'rooms',
}


def _scrape_listing(listing):
    """Extract one result-list entry into a plain, JSON-serializable dict."""
    try:
        title_el = listing.find_element(By.CSS_SELECTOR, 'a.result-list-entry__brand-title-container')
        title = format_title(title_el.text)
        link = title_el.get_attribute('href')
    except selenium.common.exceptions.NoSuchElementException:
        # Entries without the title anchor are still recorded, just untitled.
        title = None
        link = None

    address = listing.find_element(By.CSS_SELECTOR, 'button.result-list-entry__map-link').text
    entry = {
        'uuid': gen_uuid(),
        'title': title,
        'link': link,
        'address': address,
        'exactAddress': is_exact_address(address),
        'mieterPlus': is_mieter_plus(listing),
    }

    for criterion in listing.find_elements(By.CSS_SELECTOR, 'dl.result-list-entry__primary-criterion'):
        name = criterion.find_element(By.CSS_SELECTOR, 'dt').text
        raw = criterion.find_element(By.CSS_SELECTOR, 'dd').text
        key = _CRITERION_KEYS.get(name)
        if key is None:
            # Unrecognized label: keep the raw text and skip number parsing.
            entry['unknown'] = [raw]
        else:
            entry[key] = to_number(raw)
    return entry


def _append_rows(path, rows):
    """Append *rows* to the JSON list stored at *path*, creating the file if needed."""
    try:
        with open(path, 'r', encoding='utf-8') as existing:
            stored = json.load(existing)
    except FileNotFoundError:
        stored = []
    # Serialize before opening for writing so a serialization error cannot
    # leave a truncated file behind.
    payload = json.dumps(stored + rows, ensure_ascii=False)
    with open(path, 'w', encoding='utf-8') as out:
        out.write(payload)


def main():
    """Scrape the search result pages one by one and append them to ``out.json``.

    Opens the search URL in Firefox, waits for the user to confirm on stdin,
    then walks through the result pages. Every page's listings are written to
    ``out.json`` *before* the pagination check, so the final page is stored
    as well. The loop stops when the "Next page" link is missing or its parent
    element is marked ``disabled``.

    Side effects: binds the module-level name ``driver``, reads stdin, prints
    progress to stdout and rewrites ``out.json`` in the working directory.
    NOTE(review): the browser is never closed explicitly -- confirm whether
    that is intentional before adding ``driver.quit()``.
    """
    global driver
    url = (
        'https://www.immobilienscout24.de/Suche/radius/wohnung-mieten?'
        'centerofsearchaddress=Hamburg;;;1276006001;Hamburg;&'
        'price=-1000.0&'
        'pricetype=calculatedtotalrent&'
        'geocoordinates=53.55384;9.99165;10.0&'
        'enteredFrom=result_list'
    )
    driver = webdriver.Firefox()
    driver.get(url)
    input('Start?')

    page = 1
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        listings = driver.find_elements(By.CSS_SELECTOR, 'li.result-list__listing')
        data = [_scrape_listing(listing) for listing in listings]

        # Persist first: previously the loop exited on the last page before
        # its rows were written, silently dropping them.
        _append_rows('out.json', data)
        print(f'\033[33mScraped page {page}\033[m')

        try:
            next_page = driver.find_element(By.CSS_SELECTOR, 'a[aria-label="Next page"]')
            # get_attribute returns None when the attribute is absent.
            parent_class = next_page.find_element(By.XPATH, '..').get_attribute('class') or ''
        except selenium.common.exceptions.NoSuchElementException:
            next_page = None
            parent_class = ''

        if next_page is None or 'disabled' in parent_class:
            print('\033[31mNo next page. Exiting\033[m')
            break

        next_page.click()
        page += 1
# Script entry point: run the scraper, then print a green completion message.
if __name__ == '__main__':
    main()
    print('\033[32mDone\033[m')