No clue, dude
This commit is contained in:
parent 4996420156
commit 61dc4b2c98
@@ -1,10 +1,134 @@
#!/usr/bin/env python3

import re
import json
import uuid
from time import sleep

import scrapy
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By


# Unused scrapy spider stub; the actual scraping below is done with selenium.
class ImmoscoutSpider(scrapy.Spider):
    name = "immoscout"
    allowed_domains = ["www.immobilienscout24.de"]
    start_urls = ["http://www.immobilienscout24.de/"]

    def parse(self, response):
        pass


def gen_uuid():
    # uuid1() gives every scraped entry its own identifier.
    return str(uuid.uuid1())


def get_address():
    pass
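

# A listing is flagged "MieterPlus" when it carries the plus-booking badge.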
def is_mieter_plus(listing):
    try:
        listing.find_element(By.CSS_SELECTOR, 'div.plusBooking')
        return True
    except selenium.common.exceptions.NoSuchElementException:
        return False
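

# An address that only reads "district, city" (a single comma) carries no exact street address.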
def is_exact_address(address):
    if address.count(',') == 1:
        return False
    return True
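

# Turns strings like "1.234,56 €" into [1234.56]; a range such as "2 - 3" becomes [2.0, 3.0].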
def to_number(value):
    orig = value
    values = value.split('-')
    converted_values = []
    for value in values:
        value = re.sub(r'[^\d,]', '', value)  # drop everything except digits and the decimal comma
        value = re.sub(r',', '.', value)      # decimal comma -> dot so float() accepts it
        try:
            value = float(value)
            converted_values.append(value)
        except Exception as e:
            print(f'{orig=} eur', e)
    return converted_values
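

# Strip the leading "NEU" badge text from listing titles.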
def format_title(title):
    title = title.removeprefix('NEU')
    return title
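

# Drive a Firefox session through the Hamburg rental search (calculated total rent up to 1000 €)
# and append every result page's listings to out.json until the pagination runs out.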
def main():
    global driver
    url = 'https://www.immobilienscout24.de/Suche/radius/wohnung-mieten?' \
          'centerofsearchaddress=Hamburg;;;1276006001;Hamburg;&' \
          'price=-1000.0&' \
          'pricetype=calculatedtotalrent&' \
          'geocoordinates=53.55384;9.99165;10.0&' \
          'enteredFrom=result_list'

    driver = webdriver.Firefox()
    driver.get(url)

    # Manual step: wait until the user confirms the results page is ready.
    input('Start?')

    page = 1
    while True:
        data = []
        # Scroll to the bottom of the results page before collecting the listing elements.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        listings = driver.find_elements(By.CSS_SELECTOR, 'li.result-list__listing')

        for listing in listings:
            try:
                title_el = listing.find_element(By.CSS_SELECTOR, 'a.result-list-entry__brand-title-container')
                title = format_title(title_el.text)
                link = title_el.get_attribute('href')
            except selenium.common.exceptions.NoSuchElementException:
                title = None
                link = None

            data.append({
                'uuid': gen_uuid(),
                'title': title,
                'link': link
            })

            address = listing.find_element(By.CSS_SELECTOR, 'button.result-list-entry__map-link').text
            data[-1]['address'] = address
            data[-1]['exactAddress'] = is_exact_address(address)
            data[-1]['mieterPlus'] = is_mieter_plus(listing)

            # Map the primary criteria (Warmmiete / Wohnfläche / Zi.) to rent / area / rooms.
            primary_criteria = listing.find_elements(By.CSS_SELECTOR, 'dl.result-list-entry__primary-criterion')
            for pc in primary_criteria:
                name = pc.find_element(By.CSS_SELECTOR, 'dt').text
                value = pc.find_element(By.CSS_SELECTOR, 'dd').text
                orig = value
                values = to_number(value)
                if name == 'Warmmiete':
                    key = 'rent'
                elif name == 'Wohnfläche':
                    key = 'area'
                elif name == 'Zi.':
                    key = 'rooms'
                else:
                    key = 'unknown'
                    values = [orig]  # keep the raw text for unrecognised criteria
                data[-1][key] = values

        # Merge this page's results into out.json (created on the first write).
        try:
            with open('out.json', 'r', encoding='utf-8') as in_data:
                loaded_data = json.loads(in_data.read())
        except FileNotFoundError:
            loaded_data = []
        all_data = loaded_data + data
        with open('out.json', 'w', encoding='utf-8') as out:
            out.write(json.dumps(all_data, ensure_ascii=False))

        print(f'\033[33mScraped page {page}\033[m')

        # Stop once the "Next page" control's parent is marked disabled.
        next_page = driver.find_element(By.CSS_SELECTOR, 'a[aria-label="Next page"]')
        parent_next_page_class = next_page.find_element(By.XPATH, '..').get_attribute('class')
        if 'disabled' in parent_next_page_class:
            print('\033[31mNo next page. Exiting\033[m')
            break

        next_page.click()
        page += 1


if __name__ == '__main__':
    main()

    print('\033[32mDone\033[m')