# -*- coding: utf8 -*-
"""Scrapy spider for the listing pages under uio.no/studier/emner/alfabetisk/.

Every ``<li>`` inside a ``<ul>`` on a matched page is turned into a
``UiofagItem`` (name + url).  The raw extracted values are also collected on
the spider and pickled to ``./state`` when the domain is closed.
"""
import re
from scrapy.xpath import HtmlXPathSelector
from scrapy.link.extractors import RegexLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib_exp import adaptors
from uiofag.items import UiofagItem
import pickle


class UioFagSpider(CrawlSpider):
    """Crawl the listing pages and emit one item per list entry."""

    domain_name = 'uio.no'
    start_urls = ['http://www.uio.no/studier/emner/alfabetisk/']

    rules = (
        # Raw string so the backslashes reach the regex engine untouched;
        # the dots are escaped so they only match a literal '.'.
        Rule(
            RegexLinkExtractor(
                allow=(r'uio\.no/studier/emner/alfabetisk/[a-zA-Z\-]+\.xml$', ),
            ),
            'parse_fagliste',
            follow=True,
        ),
    )

    # Accumulates [names, urls] pairs for every list entry seen.
    # NOTE: class-level list, i.e. shared by all instances of this spider;
    # the module only ever creates one (SPIDER below).
    items = []

    def save_state(self):
        """Pickle the collected ``self.items`` to the file ``./state``."""
        # Single-argument print(...) behaves the same as the print statement.
        print("saving state...")
        state_file = open("./state", 'wb')
        try:
            pickle.dump(self.items, state_file)
        finally:
            # Always release the handle, even if pickling fails.
            state_file.close()
        print("saved state!")

    def parse_fagliste(self, response):
        """Yield a ``UiofagItem`` for each ``<li>`` nested in a ``<ul>``.

        As a side effect, appends ``[names, urls]`` (the extracted values of
        the entry's ``a/text()`` and ``a/@href``) to ``self.items``.
        """
        hxs = HtmlXPathSelector(response)
        links = hxs.x('//ul//li')

        # The same adaptor pipeline is applied to every field.
        adaptor_pipe = [adaptors.extract, adaptors.delist(''), adaptors.strip]
        adaptor_map = {
            'name': adaptor_pipe,
            'url': adaptor_pipe,
            'description': adaptor_pipe,
        }

        for link in links:
            # Evaluate each XPath once and reuse it for the item and the
            # saved state.
            name_sel = link.x('a/text()')
            url_sel = link.x('a/@href')

            item = UiofagItem()
            item.set_adaptors(adaptor_map)
            item.attribute('name', name_sel)
            item.attribute('url', url_sel)

            self.items.append([name_sel.extract(), url_sel.extract()])
            yield item

    def parse_item(self, response):
        """Return a single empty ``UiofagItem`` (placeholder, not in ``rules``)."""
        i = UiofagItem()
        #xs = HtmlXPathSelector(response)
        #i.attribute('site_id', xs.x('//input[@id="sid"]/@value'))
        #i.attribute('name', xs.x('//div[@id="name"]'))
        #i.attribute('description', xs.x('//div[@id="description"]'))
        return [i]


SPIDER = UioFagSpider()


def domain_closed(domain, spider, status):
    """Signal handler: save the spider's state unless the run was cancelled."""
    if status == 'cancelled':
        return
    print("saving state for %s" % (str(domain)))
    spider.save_state()


# Imported here (after the spider is defined), as in the original file, to
# keep the module's import order unchanged.
from pydispatch import dispatcher
from scrapy.core import signals

dispatcher.connect(domain_closed, signal=signals.domain_closed)