1
0
Fork 0
PythonScripts/links-capture/totsrucs.py

40 lines
1.1 KiB
Python

from scrapy.item import Field
from scrapy.item import Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.loader.processors import MapCompose
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
class Capitol(Item):
    """Scraped record produced by the spider: a title and a URL."""

    # "titol" is Catalan for "title".
    titol = Field()
    # Link target(s) extracted from the visited page.
    url = Field()
class TotsRucs(CrawlSpider):
    """Crawl web.totsrucs.cat link pages and collect their target URLs.

    The crawl starts from a single listing page (``veure=temporada``;
    presumably one season -- confirm against the site) and follows every
    link whose URL matches the ``veure=elink`` pattern. Each matching page
    is handed to :meth:`parse_elink`, which yields one ``Capitol`` item.
    """

    name = "Capitols"

    # Send a desktop-browser User-Agent instead of Scrapy's default one.
    # NOTE(review): presumably needed because the site rejects the default
    # agent -- confirm.
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
    }

    start_urls = ['https://web.totsrucs.cat/index.php?pagina=elinks&veure=temporada&id=1970']

    # Per-spider download delay (seconds) between requests.
    download_delay = 2

    rules = (
        Rule(
            LinkExtractor(
                # Both "." and "?" are escaped so they match literally;
                # an unescaped "." would match any character.
                allow=r'/index\.php\?pagina=elinks&veure=elink&id=',
            ),
            follow=True,
            callback="parse_elink",
        ),
    )

    def parse_elink(self, response):
        """Build a ``Capitol`` item from a single ``elink`` page.

        Args:
            response: The downloaded page for a URL matched by ``rules``.

        Yields:
            A ``Capitol`` item whose ``url`` field holds the ``href`` values
            of the ``<a>`` elements that are direct children of
            ``<div id="contingut">``.
        """
        # ItemLoader builds its own selector from the response, so no
        # explicit Selector wrapper is needed.
        loader = ItemLoader(item=Capitol(), response=response)
        # NOTE: title extraction is currently disabled, so ``titol`` is
        # left unpopulated. Previously used expression, kept for reference:
        # loader.add_xpath('titol', '//div[@id="contingut"]/a/text()')
        loader.add_xpath('url', '//div[@id="contingut"]/a/@href')
        yield loader.load_item()