scraper.py 2.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. # encoding=utf-8
  2. import itertools
  3. import logging
  4. import time
  5. from terroroftinytown.client.errors import PleaseRetry, ScraperError,\
  6. MalformedResponse
  7. from terroroftinytown.services.registry import registry
  8. from terroroftinytown.six import u
  9. _logger = logging.getLogger(__name__)
  10. class Scraper(object):
  11. '''URL shortner scraper.
  12. Args:
  13. shortener_params (dict): The mapping has the keys:
  14. * url_template (str)
  15. * alphabet (str)
  16. * redirect_codes (list)
  17. * no_redirect_codes (list)
  18. * unavailable_codes (list)
  19. * banned_codes (list)
  20. todo_list (list): A list of integers.
  21. '''
  22. MAX_RETRY_COUNT = 10
  23. def __init__(self, shortener_params, todo_list, max_try_count=MAX_RETRY_COUNT):
  24. self.params = shortener_params
  25. self.todo_list = todo_list
  26. self.max_try_count = max_try_count
  27. self.results = {}
  28. self.service = self.get_service()(self.params)
  29. def run(self):
  30. self.service.prepare()
  31. for item in self.todo_list:
  32. for try_count in itertools.count():
  33. if try_count > 0:
  34. _logger.info('Attempt %d', (try_count + 1))
  35. if try_count > self.max_try_count:
  36. if hasattr(self.service, 'current_shortcode'):
  37. shortcode = self.service.current_shortcode
  38. else:
  39. shortcode = ''
  40. raise ScraperError(
  41. 'Number of attempts exceeded for {0} ({1}).'
  42. .format(repr(item), shortcode)
  43. )
  44. try:
  45. result = self.service.scrape_one(item)
  46. except PleaseRetry:
  47. time.sleep(10 * try_count)
  48. except MalformedResponse:
  49. _logger.info('Skipped URL due to malformed response.')
  50. self.service.wait()
  51. break
  52. else:
  53. if result:
  54. self.results[result['shortcode']] = result
  55. self.service.wait()
  56. break
  57. return self.results
  58. def get_service(self):
  59. if self.params['name'] in registry:
  60. return registry[self.params['name']]
  61. else:
  62. return registry[u('_default')]