#!/usr/bin/env python3
"""Scrape apartment-rental listings from an ImmobilienScout24 result list.

The script opens the search URL in Firefox, waits for the user to confirm,
then walks the paginated result list and appends one record per listing to
``out.json``.
"""

import re
import json
import uuid
from time import sleep

import selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

SEARCH_URL = (
    'https://www.immobilienscout24.de/Suche/radius/wohnung-mieten?'
    'centerofsearchaddress=Hamburg;;;1276006001;Hamburg;&'
    'price=-1000.0&'
    'pricetype=calculatedtotalrent&'
    'geocoordinates=53.55384;9.99165;10.0&'
    'enteredFrom=result_list'
)
OUTPUT_PATH = 'out.json'

# Label shown on the page (<dt> text) -> key used in the output record.
_CRITERION_KEYS = {
    'Warmmiete': 'rent',
    'Wohnfläche': 'area',
    'Zi.': 'rooms',
}


def gen_uuid():
    """Return a new version-1 UUID as a string."""
    return str(uuid.uuid1())


def get_address():
    """Unimplemented placeholder; does nothing and returns None."""
    pass


def is_mieter_plus(listing):
    """Return True if *listing* contains a ``div.plusBooking`` element."""
    try:
        listing.find_element(By.CSS_SELECTOR, 'div.plusBooking')
    except NoSuchElementException:
        return False
    return True


def is_exact_address(address):
    """Return False when *address* contains exactly one comma, else True."""
    return address.count(',') != 1


def to_number(value):
    """Parse a displayed value (or ``a - b`` range) into a list of floats.

    The text is split on ``-``; in each part everything except digits and
    the comma is removed (which also strips ``.`` separators and unit
    suffixes), and the comma is then read as the decimal point. Parts that
    do not form a number are reported on stdout and skipped.

    Args:
        value: Text as shown on the page, e.g. ``'1.234,50 €'``.

    Returns:
        A list with one float per successfully parsed part.
    """
    orig = value  # name is part of the diagnostic output via ``{orig=}``
    converted_values = []
    for part in value.split('-'):
        cleaned = re.sub(r'[^\d,]', '', part).replace(',', '.')
        try:
            converted_values.append(float(cleaned))
        except ValueError as e:
            print(f'{orig=} eur', e)
    return converted_values


def format_title(title):
    """Return *title* without a leading ``'NEU'`` marker."""
    return title.removeprefix('NEU')


def _scrape_listing(listing):
    """Build the output record (a JSON-serialisable dict) for one listing."""
    try:
        title_el = listing.find_element(
            By.CSS_SELECTOR, 'a.result-list-entry__brand-title-container')
        title = format_title(title_el.text)
        link = title_el.get_attribute('href')
    except NoSuchElementException:
        title = None
        link = None

    # Treat a missing address like a missing title: record None instead of
    # letting a single odd entry abort the whole scrape.
    try:
        address = listing.find_element(
            By.CSS_SELECTOR, 'button.result-list-entry__map-link').text
    except NoSuchElementException:
        address = None

    record = {
        'uuid': gen_uuid(),
        'title': title,
        'link': link,
        'address': address,
        'exactAddress': address is not None and is_exact_address(address),
        'mieterPlus': is_mieter_plus(listing),
    }

    criteria = listing.find_elements(
        By.CSS_SELECTOR, 'dl.result-list-entry__primary-criterion')
    for criterion in criteria:
        name = criterion.find_element(By.CSS_SELECTOR, 'dt').text
        raw = criterion.find_element(By.CSS_SELECTOR, 'dd').text
        key = _CRITERION_KEYS.get(name)
        if key is None:
            # Unrecognised criteria keep their raw text.
            # NOTE: several unrecognised criteria on one listing overwrite
            # each other here; only the last one is kept.
            record['unknown'] = [raw]
        else:
            record[key] = to_number(raw)
    return record


def _append_to_json(path, records):
    """Append *records* to the JSON array stored at *path*.

    A missing file is treated as an empty array; the file is rewritten in
    full with the combined list.
    """
    try:
        with open(path, 'r', encoding='utf-8') as in_data:
            loaded_data = json.load(in_data)
    except FileNotFoundError:
        loaded_data = []

    with open(path, 'w', encoding='utf-8') as out:
        json.dump(loaded_data + records, out, ensure_ascii=False)


def _find_next_page(driver):
    """Return the usable "Next page" link, or None if there is none.

    The link counts as unusable when it is absent or when its parent
    element's class attribute contains ``disabled``.
    """
    try:
        next_page = driver.find_element(
            By.CSS_SELECTOR, 'a[aria-label="Next page"]')
    except NoSuchElementException:
        return None
    # get_attribute returns None when the parent has no class attribute.
    parent_class = next_page.find_element(By.XPATH, '..').get_attribute('class')
    if 'disabled' in (parent_class or ''):
        return None
    return next_page


def main():
    """Scrape every result page and append the listings to ``out.json``.

    Each page is written to disk before the pagination check, so the final
    page is saved as well. The browser is closed when scraping ends,
    whether normally or through an exception.
    """
    # Kept as a module-level global as in the original; presumably for
    # interactive inspection — confirm before removing.
    global driver
    driver = webdriver.Firefox()
    try:
        driver.get(SEARCH_URL)
        # Block until the user presses Enter (presumably to give them time
        # to dismiss dialogs in the browser first — confirm).
        input('Start?')

        page = 1
        while True:
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            listings = driver.find_elements(
                By.CSS_SELECTOR, 'li.result-list__listing')
            data = [_scrape_listing(listing) for listing in listings]

            # Save first, then decide whether to continue.
            _append_to_json(OUTPUT_PATH, data)
            print(f'\033[33mScraped page {page}\033[m')

            next_page = _find_next_page(driver)
            if next_page is None:
                print('\033[31mNo next page. Exiting\033[m')
                break

            # NOTE(review): nothing waits for the next result page to load
            # before the loop scrapes again — confirm click() blocks until
            # navigation finishes, or add an explicit wait.
            next_page.click()
            page += 1
    finally:
        driver.quit()


if __name__ == '__main__':
    main()
    print('\033[32mDone\033[m')