123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- # encoding=utf-8
- from __future__ import unicode_literals
- import re
- import time
- from terroroftinytown.client import errors
- from terroroftinytown.client.errors import PleaseRetry
- from terroroftinytown.services.base import BaseService
- from terroroftinytown.services.rand import HashRandMixin
- from terroroftinytown.services.status import URLStatus
- from terroroftinytown.six.moves import html_parser
- # __all__ = ['IsgdService']
class IsgdService(BaseService):
    """Scraper service for the is.gd URL shortener.

    On top of ``BaseService`` this class interprets the HTML pages that
    is.gd returns in place of a redirect: rate-limit notices, "Link
    Disabled" pages, preview pages, and the CloudFlare "Suspected
    phishing site" interstitial.
    """
    # NOTE: VgdService inherits from this class!
    # unavailable status code: 200 410
    # banned status code: 502

    def __init__(self, *args, **kwargs):
        BaseService.__init__(self, *args, **kwargs)
        # True while a phishing-bypass request is in flight for the current
        # shortcode; lets process_phishing() refuse to recurse.
        self._processing_phishing_page = False

    def scrape_one(self, sequence_number):
        """Reset the phishing-bypass guard, then delegate to BaseService."""
        self._processing_phishing_page = False
        return BaseService.scrape_one(self, sequence_number)

    def process_unavailable(self, response):
        """Classify an "unavailable" response by inspecting its body.

        Returns a ``(URLStatus, url, encoding)`` tuple, or whatever
        ``process_phishing`` returns for a CloudFlare interstitial.

        Raises:
            PleaseRetry: the page is a rate-limit notice.
            errors.UnexpectedNoResult: the page matches no known layout.
        """
        body = response.text

        if not body:
            return (URLStatus.unavailable, None, None)

        # Catch both types encountered in the wild:
        # <div id="main"><p>Rate limit exceeded - you must wait at least 1798 seconds before we'll service this request.</p></div>
        # <div id="main"><p>Rate limit exceeded - please wait 1 minute before accessing more shortened URLs</p></div>
        if '<div id="main"><p>Rate limit exceeded - ' in body:
            raise PleaseRetry()

        if '<div id="disabled"><h2>Link Disabled</h2>' in body:
            return self.parse_blocked(response)

        if ("<p>The full original link is shown below. <b>Click the link</b> "
                "if you'd like to proceed to the destination shown:") in body:
            return self.parse_preview(response)

        if '<title>Suspected phishing site | CloudFlare</title>' in body:
            return self.process_phishing(response)

        raise errors.UnexpectedNoResult(
            "Could not find processing unavailable for %s"
            % self.current_shortcode)

    @staticmethod
    def _unescape_html(text):
        """Decode HTML character references in *text*.

        ``HTMLParser.unescape`` was removed in Python 3.9, so prefer
        ``html.unescape`` and only fall back to the legacy method on
        interpreters where that function is not importable.
        """
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape
        return unescape(text)

    def parse_blocked(self, response):
        """Extract the original destination from a "Link Disabled" page.

        Returns ``(URLStatus.ok, url, encoding)``, or
        ``(URLStatus.unavailable, None, None)`` when the page lists an
        empty destination.

        Raises:
            errors.UnexpectedNoResult: the destination paragraph is missing.
        """
        response.encoding = 'utf-8'

        # Raw strings keep the regex escapes (\( \) \.) intact without
        # relying on Python passing unknown escape sequences through.
        pattern = (
            r"<p>For reference and to help those fighting spam the original "
            r"destination of this URL is given below \(we strongly recommend "
            r"you don't visit it since it may damage your PC\): -<br />(.*)"
            r"</p><h2>is\.gd</h2><p>is\.gd is a free service used to shorten "
            r"long URLs\."
        )
        match = re.search(pattern, response.text)

        if not match:
            raise errors.UnexpectedNoResult(
                "Could not find target URL in 'Link Disabled' page")

        url = self._unescape_html(match.group(1))

        if url == "":
            return (URLStatus.unavailable, None, None)

        return (URLStatus.ok, url, response.encoding)

    def parse_preview(self, response):
        """Extract the destination link from a preview page.

        Returns ``(URLStatus.ok, url, encoding)``.

        Raises:
            errors.UnexpectedNoResult: the "biglink" anchor is missing.
        """
        response.encoding = 'utf-8'

        pattern = (
            "<b>Click the link</b> if you'd like to proceed to the "
            "destination shown: -<br />"
            '<a href="(.*)" class="biglink">'
        )
        match = re.search(pattern, response.text)

        if not match:
            raise errors.UnexpectedNoResult(
                "Could not find target URL in 'Preview' page")

        url = self._unescape_html(match.group(1))
        return (URLStatus.ok, url, response.encoding)

    def process_phishing(self, response):
        """Bypass the CloudFlare "Suspected phishing site" interstitial.

        Reads the ``atok`` token from the page, requests the bypass URL for
        the current shortcode and hands the new response to
        ``process_response``.

        Raises:
            errors.UnexpectedNoResult: a bypass is already in progress for
                this shortcode, or the page carries no ``atok`` token.
        """
        if self._processing_phishing_page:
            raise errors.UnexpectedNoResult(
                "Alreadying processing phishing page for %s"
                % self.current_shortcode)

        self._processing_phishing_page = True

        match = re.search(
            r'<input type="hidden" name="atok" value="([a-z0-9]+)">',
            response.text)

        # Without this check a page lacking the token would fail with an
        # AttributeError on match.group() instead of a scraper error.
        if not match:
            raise errors.UnexpectedNoResult(
                "Could not find bypass token in phishing page for %s"
                % self.current_shortcode)

        # Brief pause before issuing the follow-up request.
        time.sleep(1)

        # NOTE(review): the host is hard-coded to is.gd although VgdService
        # inherits from this class -- confirm that subclass overrides this.
        url = 'https://is.gd/cdn-cgi/phish-bypass?u=/{0}&atok={1}'.format(
            self.current_shortcode, match.group(1))

        response = self.fetch_url(url)
        return self.process_response(response)
class Isgd6Service(HashRandMixin, IsgdService):
    """IsgdService combined with HashRandMixin, using a shortcode width of 6."""

    def get_shortcode_width(self):
        """Return the shortcode width for this service, which is always 6."""
        width = 6
        return width
|