isgd.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. # encoding=utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. import time
  5. from terroroftinytown.client import errors
  6. from terroroftinytown.client.errors import PleaseRetry
  7. from terroroftinytown.services.base import BaseService
  8. from terroroftinytown.services.rand import HashRandMixin
  9. from terroroftinytown.services.status import URLStatus
  10. from terroroftinytown.six.moves import html_parser
  11. # __all__ = ['IsgdService']
  12. class IsgdService(BaseService):
  13. # NOTE: VgdService inherits from this class!
  14. # unavailable status code: 200 410
  15. # banned status code: 502
  16. def __init__(self, *args, **kwargs):
  17. BaseService.__init__(self, *args, **kwargs)
  18. self._processing_phishing_page = False
  19. def scrape_one(self, sequence_number):
  20. self._processing_phishing_page = False
  21. return BaseService.scrape_one(self, sequence_number)
  22. def process_unavailable(self, response):
  23. if not response.text:
  24. return (URLStatus.unavailable, None, None)
  25. # Catch both types encountered in the wild:
  26. # <div id="main"><p>Rate limit exceeded - you must wait at least 1798 seconds before we'll service this request.</p></div>
  27. # <div id="main"><p>Rate limit exceeded - please wait 1 minute before accessing more shortened URLs</p></div>
  28. if '<div id="main"><p>Rate limit exceeded - ' in response.text:
  29. raise PleaseRetry()
  30. if "<div id=\"disabled\"><h2>Link Disabled</h2>" in response.text:
  31. return self.parse_blocked(response)
  32. if "<p>The full original link is shown below. <b>Click the link</b> if you'd like to proceed to the destination shown:" in response.text:
  33. return self.parse_preview(response)
  34. if '<title>Suspected phishing site | CloudFlare</title>' in response.text:
  35. return self.process_phishing(response)
  36. raise errors.UnexpectedNoResult("Could not find processing unavailable for %s" % self.current_shortcode)
  37. def parse_blocked(self, response):
  38. response.encoding = 'utf-8'
  39. match = re.search("<p>For reference and to help those fighting spam the original destination of this URL is given below \(we strongly recommend you don't visit it since it may damage your PC\): -<br />(.*)</p><h2>is\.gd</h2><p>is\.gd is a free service used to shorten long URLs\.", response.text)
  40. if not match:
  41. raise errors.UnexpectedNoResult("Could not find target URL in 'Link Disabled' page")
  42. url = match.group(1)
  43. url = html_parser.HTMLParser().unescape(url)
  44. if url == "":
  45. return (URLStatus.unavailable, None, None)
  46. return (URLStatus.ok, url, response.encoding)
  47. def parse_preview(self, response):
  48. response.encoding = 'utf-8'
  49. match = re.search("<b>Click the link</b> if you'd like to proceed to the destination shown: -<br /><a href=\"(.*)\" class=\"biglink\">", response.text)
  50. if not match:
  51. raise errors.UnexpectedNoResult("Could not find target URL in 'Preview' page")
  52. url = match.group(1)
  53. return (URLStatus.ok, html_parser.HTMLParser().unescape(url), response.encoding)
  54. def process_phishing(self, response):
  55. if self._processing_phishing_page:
  56. raise errors.UnexpectedNoResult("Alreadying processing phishing page for %s" % self.current_shortcode)
  57. self._processing_phishing_page = True
  58. time.sleep(1)
  59. match = re.search(r'<input type="hidden" name="atok" value="([a-z0-9]+)">', response.text)
  60. url = 'https://is.gd/cdn-cgi/phish-bypass?u=/{0}&atok={1}'.format(
  61. self.current_shortcode, match.group(1))
  62. response = self.fetch_url(url)
  63. return self.process_response(response)
  64. class Isgd6Service(HashRandMixin, IsgdService):
  65. def get_shortcode_width(self):
  66. return 6