40 lines
1.1 KiB
Python
40 lines
1.1 KiB
Python
from scrapy.item import Field
|
|
from scrapy.item import Item
|
|
from scrapy.spiders import CrawlSpider, Rule
|
|
from scrapy.selector import Selector
|
|
from scrapy.loader.processors import MapCompose
|
|
from scrapy.linkextractors import LinkExtractor
|
|
from scrapy.loader import ItemLoader
|
|
|
|
|
|
class Capitol(Item):
    """Scraped record with two declared fields: ``titol`` and ``url``."""

    # Text associated with the scraped link (presumably its title -- confirm).
    titol = Field()
    # Link target taken from the scraped page.
    url = Field()
|
|
|
|
|
|
class TotsRucs(CrawlSpider):
    """Crawl spider that collects link targets from totsrucs.cat "elink" pages.

    The crawl starts at one season listing page, follows every link whose
    URL matches the "elink" detail pattern, and emits one ``Capitol`` item
    per visited detail page.
    """

    name = "Capitols"

    custom_settings = {
        # Send a desktop Chrome user agent string with every request.
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
    }

    start_urls = ['https://web.totsrucs.cat/index.php?pagina=elinks&veure=temporada&id=1970']

    # Delay (in seconds) between consecutive requests to the site.
    download_delay = 2

    rules = (
        Rule(
            LinkExtractor(
                # The dot is escaped so that only a literal "index.php"
                # matches, not "index" + any character + "php".
                allow=r'/index\.php\?pagina=elinks&veure=elink&id='
            ),
            follow=True,
            callback="parse_elink",
        ),
    )

    def parse_elink(self, response):
        """Yield a ``Capitol`` item populated from one "elink" detail page.

        Args:
            response: The downloaded detail page.

        Yields:
            The loaded ``Capitol`` item; only its ``url`` field is filled,
            with the ``href`` of every ``<a>`` that is a direct child of
            ``<div id="contingut">``.
        """
        # Hand the response to the loader, which builds the selector itself;
        # no manually constructed Selector is needed.
        loader = ItemLoader(item=Capitol(), response=response)

        # NOTE: extraction of ``titol`` is intentionally left disabled, as in
        # the original code:
        # loader.add_xpath('titol', '//div[@id="contingut"]/a/text()')
        loader.add_xpath('url', '//div[@id="contingut"]/a/@href')

        yield loader.load_item()