diff --git a/departiculares.py b/departiculares.py
new file mode 100644
index 0000000..9be31f9
--- /dev/null
+++ b/departiculares.py
@@ -0,0 +1,100 @@
+import requests
+import json
+from bs4 import BeautifulSoup
+import re
+from time import sleep
+
+
+class Anunci:
+    def __init__(self, json_text, preu_capturat):
+        json_tree = json.loads(json_text, strict=False)
+        self.titol = json_tree['name']
+        self.descripcio = json_tree['description']
+        self.url = json_tree['url']
+        self.preu = preu_capturat
+
+    def te_piscina(self):
+        # "piscina" plus common misspellings such as "picina" or "pizina".
+        return re.search(r"pi(c|s|z)+ina", self.descripcio, re.IGNORECASE) is not None
+
+    def te_parquing(self):
+        # "parquing", "pàrquing", "parking" and similar variants.
+        return re.search(r"p.r(qu|k)in", self.descripcio, re.IGNORECASE) is not None
+
+    def te_traster(self):
+        return re.search(r"traster", self.descripcio, re.IGNORECASE) is not None
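+
+# A minimal sketch (with made-up values) of the ld+json payload the Anunci
+# constructor expects; 'name', 'description' and 'url' are the keys it
+# actually reads:
+#   {"name": "Pis al Raval",
+#    "description": "Pis amb piscina comunitària i pàrquing inclòs",
+#    "url": "https://www.departiculares.com/alquiler/barcelona/..."}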
+
+
+headers = {
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
+}
+base_url = "https://www.departiculares.com/alquiler/barcelona"
+
+pagina = 1
+pagina_horitzo = 2
+while pagina < pagina_horitzo:
+    print("Spider visiting page " + str(pagina))
+    cua_url = ""
+    if pagina > 1:
+        # The first query parameter is introduced with '?', not '&'.
+        cua_url = "?page=" + str(pagina)
+    url = base_url + cua_url
+
+    respuesta = requests.get(url, headers=headers)
+    soup = BeautifulSoup(respuesta.text, features="lxml")
+    resultats = soup.find('ul', class_="list-results")
+    if resultats is not None:
+        resultats_item = resultats.find_all('li', class_="list-result-item")
+        anuncis_list = []
+
+        for resultat_item in resultats_item:
+            preu_txt = resultat_item.find('p', class_="details-price")
+            preu = -1
+            if preu_txt is not None:
+                preu = int(preu_txt.text.strip().replace('.', '').replace('€', ''))
+
+            # Ads without a detected price (preu == -1) also pass this filter.
+            if preu <= 1000:
+                res_json = resultat_item.find('script', type='application/ld+json')
+                if res_json is None:
+                    continue
+                anunci = Anunci(res_json.string, preu)
+
+                if anunci.te_piscina() and anunci.te_parquing():
+                    anuncis_list.append(anunci)
+
+        for anunci_seleccionat in anuncis_list:
+            print("** Ad **")
+            print("Title: " + anunci_seleccionat.titol)
+            print("Description: " + anunci_seleccionat.descripcio)
+            print("Url: " + anunci_seleccionat.url)
+            if anunci_seleccionat.preu > 0:
+                print("Price: " + str(anunci_seleccionat.preu))
+            print("************")
+
+        resultats = soup.find('ul', class_="pager")
+        if resultats is not None:
+            resultats_item = resultats.find_all('a')
+            for resultat_item in resultats_item:
+                paginador = resultat_item.text
+                if paginador.isnumeric():
+                    pagina_horitzo = int(paginador)
+
+        print("horizon set to " + str(pagina_horitzo))
+
+        pagina = pagina + 1
+        print("taking a break...")
+        sleep(10)
+
+    else:
+        print("looks like there is nothing to process")
+        break  # avoid retrying the same empty page forever
+
+print("End of crawl")
diff --git a/totsrucs.py b/totsrucs.py
new file mode 100644
index 0000000..bb58f2d
--- /dev/null
+++ b/totsrucs.py
@@ -0,0 +1,43 @@
+from scrapy.item import Field
+from scrapy.item import Item
+from scrapy.spiders import CrawlSpider, Rule
+from scrapy.linkextractors import LinkExtractor
+from scrapy.loader import ItemLoader
+
+
+class Capitol(Item):
+    titol = Field()
+    url = Field()
+
+
+class TotsRucs(CrawlSpider):
+
+    name = "Capitols"
+    custom_settings = {
+        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
+    }
+
+    start_urls = ['https://web.totsrucs.cat/index.php?pagina=elinks&veure=temporada&id=1970']
+
+    download_delay = 2
+
+    rules = (
+        Rule(
+            LinkExtractor(
+                allow=r'/index\.php\?pagina=elinks&veure=elink&id='
+            ), follow=True, callback="parse_elink"
+        ),
+    )
+
+    def parse_elink(self, response):
+        # Passing the response directly lets Scrapy build the selector itself.
+        loader = ItemLoader(item=Capitol(), response=response)
+
+        loader.add_xpath('titol', '//div[@id="contingut"]/a/text()')
+        loader.add_xpath('url', '//div[@id="contingut"]/a/@href')
+
+        yield loader.load_item()
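+
+# Usage sketch (not part of the original spider): this file can be run without
+# a full Scrapy project, e.g. `scrapy runspider totsrucs.py -o capitols.json`;
+# the output file name "capitols.json" is only an example.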