base.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. # encoding=utf-8
  2. '''Base class for URL shortener services'''
  3. import datetime
  4. import logging
  5. import re
  6. import sys
  7. import time
  8. from requests.exceptions import ConnectionError
  9. from terroroftinytown.client import alphabet, VERSION
  10. from terroroftinytown.client.errors import (UnhandledStatusCode,
  11. UnexpectedNoResult, ScraperError, PleaseRetry, MalformedResponse)
  12. from terroroftinytown.services.status import URLStatus
  13. from terroroftinytown.six.moves import html_parser
  14. import terroroftinytown
  15. __all__ = ['BaseService', 'registry']
  16. DEFAULT_USER_AGENT = (
  17. 'URLTeam TerrorOfTinyTown/{version} (ArchiveTeam; '
  18. '+http://archiveteam.org/index.php?title=URLTeam/Appeal)'
  19. ).format(version=VERSION)
  20. class BaseService(object):
  21. def __init__(self, params):
  22. self.params = params
  23. self.logger = logging.getLogger(self.__class__.__name__)
  24. self.current_shortcode = None
  25. self.user_agent = DEFAULT_USER_AGENT
  26. self.tolerate_missing_location_header = bool(
  27. self.params.get('location_anti_regex') and \
  28. re.search(self.params['location_anti_regex'], ''))
  29. def prepare(self):
  30. pass
  31. def wait(self):
  32. sleep_time = self.params['request_delay']
  33. time.sleep(sleep_time)
  34. def transform_sequence_num(self, sequence_number):
  35. return alphabet.int_to_str(
  36. sequence_number, self.params['alphabet']
  37. )
  38. def scrape_one(self, sequence_number):
  39. self.current_shortcode = shortcode = self.transform_sequence_num(sequence_number)
  40. url = self.params['url_template'].format(shortcode=shortcode)
  41. self.logger.info('Requesting %s', url)
  42. response = self.fetch_url(url)
  43. url_status, result_url, encoding = self.process_response(response)
  44. if url_status == URLStatus.ok:
  45. assert result_url is not None
  46. self.logger.info('Got a result.')
  47. self.logger.debug('%s %s', result_url, response.encoding)
  48. return {
  49. 'shortcode': shortcode,
  50. 'url': result_url,
  51. 'encoding': encoding or 'latin-1'
  52. }
  53. def fetch_url(self, url, method=None):
  54. # this import is moved here so that tracker can import
  55. # registry without installing requests
  56. import requests
  57. assert method in (None, 'get', 'head'), method
  58. headers = {
  59. 'User-Agent': self.user_agent,
  60. }
  61. try:
  62. if method == 'get' or self.params['method'] == 'get':
  63. response = requests.get(
  64. url, allow_redirects=False, headers=headers, timeout=60)
  65. else:
  66. response = requests.head(
  67. url, allow_redirects=False, headers=headers, timeout=60)
  68. except (ConnectionError, ValueError) as e:
  69. return self.process_connection_error(e)
  70. return response
  71. def process_response(self, response):
  72. status_code = response.status_code
  73. if status_code in self.params['redirect_codes']:
  74. return self.process_redirect(response)
  75. elif status_code in self.params['no_redirect_codes']:
  76. return self.process_no_redirect(response)
  77. elif status_code in self.params['unavailable_codes']:
  78. return self.process_unavailable(response)
  79. elif status_code in self.params['banned_codes']:
  80. return self.process_banned(response)
  81. else:
  82. return self.process_unknown_code(response)
  83. def process_redirect(self, response):
  84. if 'Location' in response.headers:
  85. result_url = response.headers['Location']
  86. if sys.version_info[0] == 2 and \
  87. isinstance(result_url, terroroftinytown.six.binary_type):
  88. # Headers are treated as latin-1
  89. # This is needed so that unit tests don't need to
  90. # do implicit unicode conversion. Ick!
  91. result_url = result_url.decode('latin-1')
  92. response.content # read the response to allow connection reuse
  93. return self.check_anti_regex(response, result_url, None)
  94. elif self.params.get('body_regex'):
  95. return self.process_redirect_body(response)
  96. elif self.tolerate_missing_location_header:
  97. response.content # read the response to allow connection reuse
  98. return self.process_no_redirect(response)
  99. else:
  100. response.content # read the response to allow connection reuse
  101. raise UnexpectedNoResult(
  102. 'Unexpectedly did not get a redirect result for {0}'
  103. .format(repr(response.url))
  104. )
  105. def process_redirect_body(self, response):
  106. pattern = self.params['body_regex']
  107. match = re.search(pattern, html_unescape(response.text))
  108. if match:
  109. return self.check_anti_regex(response, match.group(1), response.encoding)
  110. else:
  111. raise UnexpectedNoResult(
  112. 'Unexpectedly did not get a body result for {0}'
  113. .format(repr(response.url))
  114. )
  115. def process_no_redirect(self, response):
  116. return (URLStatus.not_found, None, None)
  117. def process_unavailable(self, response):
  118. return (URLStatus.unavailable, None, None)
  119. def process_banned(self, response):
  120. raise PleaseRetry('Server said: {0}'.format(repr(response.reason)))
  121. def process_unknown_code(self, response):
  122. raise UnhandledStatusCode(
  123. 'Unknown status code {0} for {1}'.format(response.status_code,
  124. repr(response.url))
  125. )
  126. def process_connection_error(self, exception):
  127. ex_args = repr(exception.args)
  128. if 'ProtocolError' in ex_args or 'Invalid IPv6 URL' in ex_args:
  129. raise MalformedResponse(
  130. 'Malformed response: {0}'.format(ex_args))
  131. else:
  132. raise PleaseRetry('Connection error: {0}'.format(ex_args))
  133. def check_anti_regex(self, response, result_url, encoding):
  134. if not result_url or self.matches_anti_regex(result_url):
  135. return self.process_no_redirect(response)
  136. else:
  137. return (URLStatus.ok, result_url, encoding)
  138. def matches_anti_regex(self, result_url):
  139. anti_regex = self.params.get('location_anti_regex')
  140. return (anti_regex and re.search(anti_regex, result_url))
  141. class DefaultService(BaseService):
  142. pass
  143. _html_parser_unescaper = html_parser.HTMLParser()
  144. def html_unescape(text):
  145. return _html_parser_unescaper.unescape(text)