tinyurl.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160
  1. import logging
  2. import re
  3. import sys
  4. from terroroftinytown.client.errors import UnexpectedNoResult, \
  5. UnhandledStatusCode, PleaseRetry
  6. from terroroftinytown.services.base import BaseService
  7. from terroroftinytown.services.rand import HashRandMixin
  8. from terroroftinytown.services.status import URLStatus
  9. from terroroftinytown.six.moves import html_parser
  10. from terroroftinytown.six.moves.urllib import parse as urlparse
  11. _logger = logging.getLogger(__name__)
  12. class TinyurlService(BaseService):
  13. def prepare(self):
  14. self.user_agent = 'curl/7.37.1 (not really) {0}'.format(self.user_agent)
  15. def process_redirect(self, response):
  16. if response.status_code == 200:
  17. return self._fetch_200(response)
  18. else:
  19. if 'Location' in response.headers and response.status_code == 301:
  20. tiny = response.headers.get("X-tiny")
  21. if tiny and tiny[:3] == "aff":
  22. return self._preview(
  23. self.current_shortcode, response.headers['Location']
  24. )
  25. try:
  26. return BaseService.process_redirect(self, response)
  27. except UnexpectedNoResult:
  28. return (URLStatus.unavailable, None, None)
  29. def _fetch_200(self, response):
  30. new_response = self.fetch_url(response.url, method='get')
  31. new_response.encoding = 'utf-8'
  32. if new_response.status_code != 200:
  33. raise PleaseRetry(
  34. 'Strange 200 change to {0} for {1}'.format(
  35. new_response.status_code, repr(response.url))
  36. )
  37. if "<title>Redirecting...</title>" in new_response.text:
  38. return self._parse_errorhelp(new_response)
  39. elif "Error: TinyURL redirects to a TinyURL." in new_response.text:
  40. return self._parse_tinyurl_redirect(new_response)
  41. elif 'This TinyURL went to:':
  42. return self._parse_spam_blocklist(new_response)
  43. else:
  44. raise UnhandledStatusCode(
  45. 'Unhandled 200 change to {0} for {1}'.format(
  46. new_response.status_code, repr(response.url))
  47. )
  48. def _parse_errorhelp(self, response):
  49. match = re.search('<meta http-equiv="refresh" content="0;url=(.*?)">', response.text)
  50. if not match:
  51. raise UnexpectedNoResult("No redirect on \"errorhelp\" page on HTTP status 200 for {0}".format(response.url))
  52. url = urlparse.urlparse(match.group(1))
  53. if url.scheme != "http" or url.netloc != "tinyurl.com" or url.path != "/errorb.php":
  54. raise UnexpectedNoResult("Unexpected redirect on \"errorhelp\" page on HTTP status 200 for {0}".format(response.url))
  55. if sys.version_info[0] == 2:
  56. query = urlparse.parse_qs(url.query.encode('utf-8'))
  57. else:
  58. query = urlparse.parse_qs(url.query)
  59. if not ("url" in query and len(query["url"]) == 1) or not ("path" in query and len(query["path"]) == 1):
  60. raise UnexpectedNoResult("Unexpected redirect on \"errorhelp\" page on HTTP status 200 for {0}".format(response.url))
  61. if query["path"][0] != ("/" + self.current_shortcode):
  62. raise UnexpectedNoResult("Code mismatch on \"errorhelp\" on HTTP status 200")
  63. encoding = response.encoding
  64. if sys.version_info[0] == 2:
  65. try:
  66. result_url = query["url"][0].decode('utf-8')
  67. except UnicodeError:
  68. try:
  69. result_url = query["url"][0].decode('cp1252')
  70. encoding = 'cp1252'
  71. except UnicodeError:
  72. result_url = query["url"][0].decode('latin-1')
  73. encoding = 'latin-1'
  74. else:
  75. result_url = query["url"][0]
  76. return (URLStatus.ok, result_url, encoding)
  77. def _parse_tinyurl_redirect(self, response):
  78. match = re.search("<p class=\"intro\">The URL you followed redirects back to a TinyURL and therefore we can't directly send you to the site\\. The URL it redirects to is (?:<script>.*?</script>)?<a href=\"(.*?)\">", response.text, re.DOTALL)
  79. if not match:
  80. raise UnexpectedNoResult("No redirect on \"tinyurl redirect\" page on HTTP status 200 for {0}".format(response.url))
  81. url = match.group(1)
  82. return (URLStatus.ok, html_parser.HTMLParser().unescape(url), response.encoding)
  83. def _parse_spam_blocklist(self, response):
  84. match = re.search("<p>This TinyURL went to: (.*?)</p>", response.text, re.DOTALL)
  85. if not match:
  86. raise UnexpectedNoResult("No redirect on \"spam redirect\" page on HTTP status 200 for {0}".format(response.url))
  87. url = match.group(1)
  88. return (URLStatus.ok, html_parser.HTMLParser().unescape(url), response.encoding)
  89. def _preview(self, code, affiliate_url):
  90. response = self.fetch_url("https://tinyurl.com/preview.php?num=" + code, method='get')
  91. if response.status_code != 200:
  92. raise UnexpectedNoResult("Unexpected HTTP status %i on preview page %s" % (response.status_code, response.url))
  93. match = re.search("<a id=\"redirecturl\" href=\"(.*?)\">Proceed to this site.</a>", response.text, re.DOTALL)
  94. if not match:
  95. raise UnexpectedNoResult("No redirect on preview page {0}".format(response.url))
  96. url = match.group(1)
  97. if url == "":
  98. return self._scrub_url(code, affiliate_url)
  99. return (URLStatus.ok, html_parser.HTMLParser().unescape(url), response.encoding)
  100. def _scrub_url(self, code, url):
  101. parsed_url = urlparse.urlparse(url)
  102. if parsed_url.hostname == "redirect.tinyurl.com" and parsed_url.path == "/api/click":
  103. if sys.version_info[0] == 2:
  104. query = urlparse.parse_qs(parsed_url.query.encode('latin-1'))
  105. else:
  106. query = urlparse.parse_qs(parsed_url.query, encoding='latin-1')
  107. if query["out"]:
  108. if sys.version_info[0] == 2:
  109. scrubbed_url = query["out"][0].decode('latin-1')
  110. else:
  111. scrubbed_url = query["out"][0]
  112. return (URLStatus.ok, scrubbed_url, 'latin-1')
  113. return (URLStatus.ok, url, 'latin-1')
  114. class Tinyurl7Service(HashRandMixin, TinyurlService):
  115. def get_shortcode_width(self):
  116. return 7