#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os
import statistics
import sys
import logging
import codecs
import time
import xlsxwriter
import datetime
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException, StaleElementReferenceException

# Pre-selected map areas on en.aruodas.lt, one per district listed in `pages` below.
href_1 = 'https://en.aruodas.lt/nt_zemelapis/?obj=1&on_map=1&type=map&FSelectedArea=u99znh05tds4%2Cu99znqdmts7u%2Cu99znx4p99ee%2Cu99znvstves2%2Cu99znenb3dhq%2Cu99zne2djsev%2Cu99znkhh1th2%2Cu99zjgkyjwh3%2Cu99zjge8t8h4%2Cu99zjggyvwec%2Cu99znh0y3x5b#zoom:13;center:(54.698048468880216,25.254161356320424)'
href_2 = 'https://en.aruodas.lt/nt_zemelapis/?obj=1&on_map=1&type=map&FSelectedArea=u99znv59jpvx%2Cu99znu91thv0%2Cu99zndzq3ptt%2Cu99znfcyjnt1%2Cu99zp56n9jtn%2Cu99zp5n3ejmw%2Cu99zp71dg4t3%2Cu99zp7mwvnv7%2Cu99zp7tvt5tp%2Cu99zpk5um5me%2Cu99zph7t10tp%2Cu99znvp055j4%2Cu99znv58ejt2#zoom:14;center:(54.68989490798361,25.272123999999963)'
href_3 = 'https://en.aruodas.lt/nt_zemelapis/?obj=1&on_map=1&type=map&FSelectedArea=u99zp75y38fj%2Cu99zp7p5d91g%2Cu99zpe1mqt1s%2Cu99zpe499scs%2Cu99zpdfdwwcu%2Cu99zpddj8t1f%2Cu99zpd6nkt9r%2Cu99zpd1rqe1x%2Cu99zpd4368cz%2Cu99zp9uzfd9d%2Cu99zp9usnd64%2Cu99zp9ejdwcy%2Cu99zp90wud6p%2Cu99zp3pkref4%2Cu99zp3hq1e3s%2Cu99zp38xkw3d%2Cu99zp4nvqd3g%2Cu99zp4wdh96p%2Cu99zp4yvsxcz%2Cu99zp5r8399e%2Cu99zp760ddct%2Cu99zp7hj2t1u#zoom:14;center:(54.68006792227714,25.287418499999944)'
href_4 = 'https://en.aruodas.lt/nt_zemelapis/?obj=1&on_map=1&type=map&FSelectedArea=u99zpdtuqntv%2Cu99zpdt9d4jb%2Cu99zpdd531t6%2Cu99zpd6jdjm3%2Cu99zpd1rtpt9%2Cu99zpd41fnt7%2Cu99zpdj9tjm1%2Cu99zpdnz9pt1%2Cu99zpdrc1jvb%2Cu99zpfhd4pvw%2Cu99zpfpsm0tk%2Cu9dp042mk5vp%2Cu99zpfxvq4m0%2Cu99zpfy904tj%2Cu99zpghgqpte%2Cu99zpg5xf4m1%2Cu99zpgdcd0jh%2Cu99zpgdhs1jr%2Cu99zpg95h5j2%2Cu99zpdyjypvd%2Cu99zpdtg3njr#zoom:15;center:(54.68360773485153,25.302744000000075)'

filename_output = os.path.join('output', 'ImoBot_output.xlsx')
history_filepath = os.path.join('output', 'history.log')
# chromedriver binaries: the .exe is used on Windows (os.name == 'nt'), the plain binary on other platforms.
driver_win_filepath = os.path.join('driver', 'chromedriver.exe')
driver_nt_filepath = os.path.join('driver', 'chromedriver')

mailto = ['dennis.thiessen@riskahead.de', 'kevin.gruendel@riskahead.de', 'florianbergel@yahoo.de']
pages = [['Zverynas', href_1], ['Gedimino Prospektas', href_2], ['Old City', href_3], ['Uzupis', href_4]]
# mailto = ['dennis.thiessen@riskahead.de']
# pages = [['Zverynas', href_1]]


class Area:
    def __init__(self, name, href, flats_for_rent=None, flats_for_sale=None, houses_for_sale=None, plots_for_sale=None):
        self.name = name
        self.href = href
        self.flats_for_rent = flats_for_rent
        self.flats_for_sale = flats_for_sale
        self.houses_for_sale = houses_for_sale
        self.plots_for_sale = plots_for_sale


class Property:
    def __init__(self, href, addr, desc, price):
        self.href = href
        self.addr = addr
        self.desc = desc
        self.price = price

    def get_price(self):
        # The listing price string carries the total price and the price per m², separated by '€';
        # strip thousands spaces and convert decimal commas before parsing.
        return float(self.price.replace(" ", "").replace(",", ".").split('€')[0])

    def get_price_m2(self):
        return float(self.price.replace(" ", "").replace(",", ".").split('€')[1][1:])

    def get_m2(self):
        return float(self.get_price() / self.get_price_m2())

    def get_roi(self, avg_rent_m2):
        # Gross rental yield: twelve months of rent at avg_rent_m2 divided by the purchase price.
        return float(avg_rent_m2 * self.get_m2() * 12 / self.get_price())


class PropertyCollection:
    def __init__(self, properties):
        self.properties = properties
        self.average_price = self.calc_average_price()
        self.average_price_m2 = self.calc_average_price_m2()
        self.median_price = self.calc_median_price()
        self.median_price_m2 = self.calc_median_price_m2()

    def calc_average_price(self):
        return sum(c.get_price() for c in self.properties) / len(self.properties) if len(self.properties) > 0 else 0

    def calc_average_price_m2(self):
        return sum(c.get_price_m2() for c in self.properties) / len(self.properties) if len(self.properties) > 0 else 0

    def calc_median_price(self):
        return statistics.median([c.get_price() for c in self.properties]) if len(self.properties) > 0 else 0

    def calc_median_price_m2(self):
        return statistics.median([c.get_price_m2() for c in self.properties]) if len(self.properties) > 0 else 0

    def sort_by_roi(self, price_m2):
        if len(self.properties) > 0:
            self.properties = sorted(self.properties, key=lambda x: x.get_roi(price_m2), reverse=True)

    def count(self):
        return len(self.properties)


def get_script_path():
    return os.path.dirname(os.path.realpath(sys.argv[0]))


def send_mail(send_from, send_to, subject, text, files=None, server="mail.riskahead.de"):
    assert isinstance(send_to, list)

    msg = MIMEMultipart()
    msg['From'] = send_from
    msg['To'] = ", ".join(send_to)
    msg['Subject'] = subject
    msg.attach(MIMEText(text))

    for f in files or []:
        with open(os.path.join(get_script_path(), f), "rb") as fil:
            part = MIMEApplication(
                fil.read(),
                Name=os.path.basename(f)
            )
        # After the file is closed
        part['Content-Disposition'] = 'attachment; filename="%s"' % os.path.basename(f)
        msg.attach(part)

    smtp = smtplib.SMTP_SSL(server)
    smtp.login('support@riskahead.de', "405risksupport")
    smtp.sendmail(send_from, send_to, msg.as_string())
    smtp.close()


def switch_page(driver, page):
    # Leave the result-list iframe, pick the requested object type in the search form,
    # then switch back into the iframe that holds the result list.
    driver.switch_to.default_content()
    driver.find_element_by_id('searchFormField_obj') \
        .click()
    driver.find_element_by_id('options_obj') \
        .find_element_by_xpath('//li/label[contains(text(),\'' + page + '\')]/..') \
        .click()
    driver.switch_to.frame("sideListIframe")


def scrape_page(driver):
    elements = driver.find_elements_by_class_name('result-item-v3')
    items = []
    for element in elements:
        href = element.find_element_by_class_name('object-image-link').get_attribute('href')
        addr = element.find_element_by_class_name('item-address-v3').text
        desc = element.find_element_by_class_name('item-description-v3').text
        price = element.find_element_by_class_name('item-price-main-v3').text
        items.append(Property(href, addr, desc, price))
    try:
        pagination = driver.find_element_by_class_name('sidebar-pagination')
        next_page_element = pagination.find_elements_by_xpath('//a[contains(text(), \'Next page\')]')
        if len(next_page_element) == 1:
            # Follow the pagination recursively until there is no further "Next page" link.
            next_page_element[0].click()
            items += scrape_page(driver)
    except NoSuchElementException:
        logger.debug("no pagination found")
    return items


def scrape_pages(pages):
    logger.info('Start ImoBot...')
    start = time.time()

    chrome_driver = os.path.join(get_script_path(), driver_win_filepath) if os.name == 'nt' \
        else os.path.join(get_script_path(), driver_nt_filepath)
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=chrome_driver)

    workbook = xlsxwriter.Workbook(os.path.join(get_script_path(), filename_output))
    worksheet = workbook.add_worksheet()
    now = datetime.datetime.now()
    worksheet.write('A2', now.strftime("%Y-%m-%d %H:%M"))

    areas = []
    row = 1
    for page in pages:
        row, area = scrape_and_write_page(driver, worksheet, page[1], page[0], row)
        areas.append(area)
    workbook.close()
    end = time.time()

    subject = "ImoBot-Weekly-Results"
    text = "ImoBot-Weekly-Results\n\n" + \
           "Last run: {}\n".format(now.strftime("%Y-%m-%d %H:%M"))
    for area in areas:
        text += "{}: (Flats: {}, Houses: {}, Plots: {})\n".format(
            area.name, area.flats_for_sale, area.houses_for_sale, area.plots_for_sale)
    send_mail('webmaster@riskahead.de', mailto, subject, text, [filename_output])
    logger.info('Finished in {:.0f} sec'.format(end - start))


def scrape_and_write_page(driver, worksheet, href, name, row):
    driver.get(href)
    flats_for_sale, houses_for_sale, plots_for_sale, flats_for_rent = get_prices_from_site(driver)

    row = row + 2
    worksheet.write('A' + str(row), name)
    row, ffr = write_to_ws(flats_for_rent, "Flats for rent", flats_for_rent, worksheet, row)
    row, ffs = write_to_ws(flats_for_rent, "Flats for sale", flats_for_sale, worksheet, row)
    row, hfs = write_to_ws(flats_for_rent, "Houses for sale", houses_for_sale, worksheet, row)
    row, pfs = write_to_ws(flats_for_rent, "Plots for sale", plots_for_sale, worksheet, row)
    return row, Area(name, href, flats_for_rent=ffr, flats_for_sale=ffs, houses_for_sale=hfs, plots_for_sale=pfs)


def write_to_ws(flats_for_rent, name, properties, worksheet, row):
    row += 2
    worksheet.write('B' + str(row), name)
    row += 1
    worksheet.write('C' + str(row), 'Amount')
    worksheet.write('D' + str(row), properties.count())
    row += 1
    worksheet.write('C' + str(row), 'AVG Price')
    worksheet.write('D' + str(row), properties.average_price)
    row += 1
    worksheet.write('C' + str(row), 'AVG m² Price')
    worksheet.write('D' + str(row), properties.average_price_m2)
    row += 2
    if name == 'Flats for rent':
        worksheet.write('C' + str(row), 'MED Price')
        worksheet.write('D' + str(row), properties.median_price)
        row += 1
        worksheet.write('C' + str(row), 'MED m² Price')
        worksheet.write('D' + str(row), properties.median_price_m2)
    else:
        worksheet.write('C' + str(row), 'Address')
        worksheet.write('D' + str(row), 'GRY (AVG)')
        worksheet.write('E' + str(row), 'GRY (MED)')
        worksheet.write('F' + str(row), 'Total Price')
        worksheet.write('G' + str(row), 'Price per m²')
        worksheet.write('H' + str(row), 'Description')
        worksheet.write('I' + str(row), 'Link')
        row += 1
        # List the ten properties with the highest gross rental yield for this category.
        for prop in properties.properties[:10]:
            logger.info("ROI (Median): {:3.2%}, ROI (Average): {:3.2%}, Price: {}€, Address: {}, URL: {}".format(
                prop.get_roi(flats_for_rent.median_price_m2), prop.get_roi(flats_for_rent.average_price_m2),
                prop.price, prop.addr, prop.href))
            worksheet.write('C' + str(row), prop.addr)
            worksheet.write('D' + str(row), prop.get_roi(flats_for_rent.average_price_m2))
            worksheet.write('E' + str(row), prop.get_roi(flats_for_rent.median_price_m2))
            worksheet.write('F' + str(row), prop.get_price())
            worksheet.write('G' + str(row), prop.get_price_m2())
            worksheet.write('H' + str(row), prop.desc)
            worksheet.write('I' + str(row), prop.href)
            row += 1
    return row, properties.count()


def get_prices_from_site(driver):
    try:
        # Close the overlay if one is shown; skip it if it is missing or not visible.
        driver.find_element_by_class_name('close-button').click()
    except (NoSuchElementException, ElementNotVisibleException):
        pass
    driver.switch_to.frame("sideListIframe")
    time.sleep(3)  # Avoid race condition by giving chromedriver enough time to load the page

    logger.info("Start scraping flats for sale...")
    flats_for_sale = PropertyCollection(scrape_page(driver))
    logger.info("Found {} flats for sale".format(flats_for_sale.count()))
    logger.info("Average price: {:8.2f} €".format(flats_for_sale.average_price))
    logger.info("Average m2 price: {:5.2f} €".format(flats_for_sale.average_price_m2))
    logger.info("Median m2 price: {:5.2f} €".format(flats_for_sale.median_price_m2))

    logger.info("Start scraping houses for sale...")
    switch_page(driver, 'Houses for sale')
    time.sleep(3)  # Avoid race condition by giving chromedriver enough time to load the page
    houses_for_sale = PropertyCollection(scrape_page(driver))
    logger.info("Found {} houses for sale".format(houses_for_sale.count()))
    logger.info("Average price: {:8.2f} €".format(houses_for_sale.average_price))
    logger.info("Average m2 price: {:5.2f} €".format(houses_for_sale.average_price_m2))
    logger.info("Median m2 price: {:5.2f} €".format(houses_for_sale.median_price_m2))

    logger.info("Start scraping plots for sale...")
    switch_page(driver, 'Plots for sale')
    time.sleep(3)
    plots_for_sale = PropertyCollection(scrape_page(driver))
    logger.info("Found {} plots for sale".format(plots_for_sale.count()))
    logger.info("Average price: {:8.2f} €".format(plots_for_sale.average_price))
    logger.info("Average m2 price: {:5.2f} €".format(plots_for_sale.average_price_m2))
    logger.info("Median m2 price: {:5.2f} €".format(plots_for_sale.median_price_m2))

    logger.info("Start scraping flats for rent...")
    switch_page(driver, 'Flats for rent')
    time.sleep(3)
    flats_for_rent = PropertyCollection(scrape_page(driver))
    logger.info("Found {} flats for rent".format(flats_for_rent.count()))
    logger.info("Average price: {:4.2f} €".format(flats_for_rent.average_price))
    logger.info("Average m2 price: {:3.2f} €".format(flats_for_rent.average_price_m2))
    logger.info("Median m2 price: {:3.2f} €".format(flats_for_rent.median_price_m2))

    flats_for_sale.sort_by_roi(flats_for_rent.average_price_m2)
    houses_for_sale.sort_by_roi(flats_for_rent.average_price_m2)
    plots_for_sale.sort_by_roi(flats_for_rent.average_price_m2)

    return flats_for_sale, houses_for_sale, plots_for_sale, flats_for_rent


if __name__ == '__main__':
    assert (len(sys.argv) == 1), "Wrong number of arguments given"
    os.environ["PYTHONIOENCODING"] = "utf-8"

    logger = logging.getLogger('imobot_logger')
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler(os.path.join(get_script_path(), history_filepath), "a", encoding='utf-8')
    fh.setLevel(logging.DEBUG)
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(ch)

    scrape_pages(pages)