dashwood.net -

Ryan Stefan's Micro Blog

Scraping Domains in Order with Scrapy and Time Meta

Feb 11, 2019

I wanted a way to scrape the domains that have had the most time to cool off first, so I store a time.time() stamp in the request meta (or record one after each response) and then grab the smallest timestamp (the oldest) to decide which domain to hit next.

class PageSpider(Spider):
    """Crawl a pool of links, always requesting the domain that has had the
    longest time to "cool off" (i.e. whose last completed request is oldest).

    ``self.domain_count`` maps domain -> unix timestamp of the last response
    received from that domain; ``start_requests`` keeps yielding requests for
    the coldest domain until every link has been scraped.
    """
    name = 'page_spider'
    allowed_urls = ['https://www.amazon.com', 'https://www.ebay.com', 'https://www.etsy.com']
    custom_settings = {
        'ITEM_PIPELINES': {
            'pipelines.MainPipeline': 90,
        },
        'CONCURRENT_REQUESTS': 200,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 200,
        'ROBOTSTXT_OBEY': False,
        'CONCURRENT_ITEMS': 800,
        'REACTOR_THREADPOOL_MAXSIZE': 1600,
        # Hides printing item dicts
        'LOG_LEVEL': 'INFO',
        'RETRY_ENABLED': False,
        'REDIRECT_MAX_TIMES': 1,
        # Stops loading page after 5mb
        'DOWNLOAD_MAXSIZE': 5592405,
        # Grabs xpath before site finish loading
        'DOWNLOAD_FAIL_ON_DATALOSS': False

    }

    def __init__(self, *args, **kwargs):
        # Forward args so Scrapy's spider-argument machinery (from_crawler,
        # -a options) keeps working; the original skipped super().__init__().
        super().__init__(*args, **kwargs)
        self.links = ['www.test.com', 'www.different.org', 'www.pogostickaddict.net']
        # domain -> unix timestamp of the most recent response from it.
        self.domain_count = {}

    def _coldest_url(self):
        """Return the pending link whose domain was hit longest ago.

        Fixes two crashes in the original selection expression:
        * ``min(domain_count, ...)`` was a NameError (missing ``self.``) and
          raised ValueError while ``self.domain_count`` was still empty;
        * ``next(...)`` raised StopIteration when no pending link matched
          the coldest domain.
        Domains we have never hit score 0, so they are scheduled first.
        """
        def last_hit(link):
            # A recorded domain matches a link via the same substring test
            # the original used; unseen domains default to 0 (coldest).
            return min((ts for dom, ts in self.domain_count.items() if dom in link),
                       default=0)
        return min(self.links, key=last_hit)

    def start_requests(self):
        # NOTE(review): links are only removed in parse(), so with high
        # concurrency this generator can yield duplicate requests for a link
        # before its response arrives — dont_filter=True makes that explicit.
        while self.links:
            start_time = time.time()
            url = self._coldest_url()
            request = scrapy.Request(url, callback=self.parse, dont_filter=True,
                                     meta={'time': time.time()})

            request.meta['start_time'] = start_time
            request.meta['url'] = url
            yield request

    def parse(self, response):
        # Record when this domain was last answered so _coldest_url can
        # deprioritise it on the next pass.
        domain = response.url.split('//')[-1].split('/')[0]
        self.domain_count[domain] = time.time()

        pageloader = PageItemLoader(PageItem(), response=response)

        pageloader.add_xpath('search_results', '//div[1]/text()')
        self.links.remove(response.meta['url'])

        yield pageloader.load_item()

Comments