# encoding=utf-8 from __future__ import unicode_literals import re import time from terroroftinytown.client import errors from terroroftinytown.client.errors import PleaseRetry from terroroftinytown.services.base import BaseService from terroroftinytown.services.rand import HashRandMixin from terroroftinytown.services.status import URLStatus from terroroftinytown.six.moves import html_parser # __all__ = ['IsgdService'] class IsgdService(BaseService): # NOTE: VgdService inherits from this class! # unavailable status code: 200 410 # banned status code: 502 def __init__(self, *args, **kwargs): BaseService.__init__(self, *args, **kwargs) self._processing_phishing_page = False def scrape_one(self, sequence_number): self._processing_phishing_page = False return BaseService.scrape_one(self, sequence_number) def process_unavailable(self, response): if not response.text: return (URLStatus.unavailable, None, None) # Catch both types encountered in the wild: #

Rate limit exceeded - you must wait at least 1798 seconds before we'll service this request.

#

Rate limit exceeded - please wait 1 minute before accessing more shortened URLs

if '

Rate limit exceeded - ' in response.text: raise PleaseRetry() if "

Link Disabled

" in response.text: return self.parse_blocked(response) if "

The full original link is shown below. Click the link if you'd like to proceed to the destination shown:" in response.text: return self.parse_preview(response) if 'Suspected phishing site | CloudFlare' in response.text: return self.process_phishing(response) raise errors.UnexpectedNoResult("Could not find processing unavailable for %s" % self.current_shortcode) def parse_blocked(self, response): response.encoding = 'utf-8' match = re.search("

For reference and to help those fighting spam the original destination of this URL is given below \(we strongly recommend you don't visit it since it may damage your PC\): -
(.*)

is\.gd

is\.gd is a free service used to shorten long URLs\.", response.text) if not match: raise errors.UnexpectedNoResult("Could not find target URL in 'Link Disabled' page") url = match.group(1) url = html_parser.HTMLParser().unescape(url) if url == "": return (URLStatus.unavailable, None, None) return (URLStatus.ok, url, response.encoding) def parse_preview(self, response): response.encoding = 'utf-8' match = re.search("Click the link if you'd like to proceed to the destination shown: -
", response.text) if not match: raise errors.UnexpectedNoResult("Could not find target URL in 'Preview' page") url = match.group(1) return (URLStatus.ok, html_parser.HTMLParser().unescape(url), response.encoding) def process_phishing(self, response): if self._processing_phishing_page: raise errors.UnexpectedNoResult("Alreadying processing phishing page for %s" % self.current_shortcode) self._processing_phishing_page = True time.sleep(1) match = re.search(r'', response.text) url = 'https://is.gd/cdn-cgi/phish-bypass?u=/{0}&atok={1}'.format( self.current_shortcode, match.group(1)) response = self.fetch_url(url) return self.process_response(response) class Isgd6Service(HashRandMixin, IsgdService): def get_shortcode_width(self): return 6