12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- # encoding=utf-8
- import itertools
- import logging
- import time
- from terroroftinytown.client.errors import PleaseRetry, ScraperError,\
- MalformedResponse
- from terroroftinytown.services.registry import registry
- from terroroftinytown.six import u
- _logger = logging.getLogger(__name__)
- class Scraper(object):
- '''URL shortner scraper.
- Args:
- shortener_params (dict): The mapping has the keys:
- * url_template (str)
- * alphabet (str)
- * redirect_codes (list)
- * no_redirect_codes (list)
- * unavailable_codes (list)
- * banned_codes (list)
- todo_list (list): A list of integers.
- '''
- MAX_RETRY_COUNT = 10
- def __init__(self, shortener_params, todo_list, max_try_count=MAX_RETRY_COUNT):
- self.params = shortener_params
- self.todo_list = todo_list
- self.max_try_count = max_try_count
- self.results = {}
- self.service = self.get_service()(self.params)
- def run(self):
- self.service.prepare()
- for item in self.todo_list:
- for try_count in itertools.count():
- if try_count > 0:
- _logger.info('Attempt %d', (try_count + 1))
- if try_count > self.max_try_count:
- if hasattr(self.service, 'current_shortcode'):
- shortcode = self.service.current_shortcode
- else:
- shortcode = ''
- raise ScraperError(
- 'Number of attempts exceeded for {0} ({1}).'
- .format(repr(item), shortcode)
- )
- try:
- result = self.service.scrape_one(item)
- except PleaseRetry:
- time.sleep(10 * try_count)
- except MalformedResponse:
- _logger.info('Skipped URL due to malformed response.')
- self.service.wait()
- break
- else:
- if result:
- self.results[result['shortcode']] = result
- self.service.wait()
- break
- return self.results
- def get_service(self):
- if self.params['name'] in registry:
- return registry[self.params['name']]
- else:
- return registry[u('_default')]
|