imgur.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. #!/usr/bin/env python3
  2. import json
  3. import re
  4. from typing import Optional
  5. import bs4
  6. from praw.models import Submission
  7. from bdfr.exceptions import SiteDownloaderError
  8. from bdfr.resource import Resource
  9. from bdfr.site_authenticator import SiteAuthenticator
  10. from bdfr.site_downloaders.base_downloader import BaseDownloader
  11. class Imgur(BaseDownloader):
  12. def __init__(self, post: Submission):
  13. super().__init__(post)
  14. self.raw_data = {}
  15. def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
  16. self.raw_data = self._get_data(self.post.url)
  17. out = []
  18. if 'album_images' in self.raw_data:
  19. images = self.raw_data['album_images']
  20. for image in images['images']:
  21. out.append(self._compute_image_url(image))
  22. else:
  23. out.append(self._compute_image_url(self.raw_data))
  24. return out
  25. def _compute_image_url(self, image: dict) -> Resource:
  26. ext = self._validate_extension(image['ext'])
  27. if image.get('prefer_video', False):
  28. ext = '.mp4'
  29. image_url = 'https://i.imgur.com/' + image['hash'] + ext
  30. return Resource(self.post, image_url, Resource.retry_download(image_url))
  31. @staticmethod
  32. def _get_data(link: str) -> dict:
  33. try:
  34. imgur_id = re.match(r'.*/(.*?)(\..{0,})?$', link).group(1)
  35. gallery = 'a/' if re.search(r'.*/(.*?)(gallery/|a/)', link) else ''
  36. link = f'https://imgur.com/{gallery}{imgur_id}'
  37. except AttributeError:
  38. raise SiteDownloaderError(f'Could not extract Imgur ID from {link}')
  39. res = Imgur.retrieve_url(link, cookies={'over18': '1', 'postpagebeta': '0'})
  40. soup = bs4.BeautifulSoup(res.text, 'html.parser')
  41. scripts = soup.find_all('script', attrs={'type': 'text/javascript'})
  42. scripts = [script.string.replace('\n', '') for script in scripts if script.string]
  43. script_regex = re.compile(r'\s*\(function\(widgetFactory\)\s*{\s*widgetFactory\.mergeConfig\(\'gallery\'')
  44. chosen_script = list(filter(lambda s: re.search(script_regex, s), scripts))
  45. if len(chosen_script) != 1:
  46. raise SiteDownloaderError(f'Could not read page source from {link}')
  47. chosen_script = chosen_script[0]
  48. outer_regex = re.compile(r'widgetFactory\.mergeConfig\(\'gallery\', ({.*})\);')
  49. inner_regex = re.compile(r'image\s*:(.*),\s*group')
  50. try:
  51. image_dict = re.search(outer_regex, chosen_script).group(1)
  52. image_dict = re.search(inner_regex, image_dict).group(1)
  53. except AttributeError:
  54. raise SiteDownloaderError(f'Could not find image dictionary in page source')
  55. try:
  56. image_dict = json.loads(image_dict)
  57. except json.JSONDecodeError as e:
  58. raise SiteDownloaderError(f'Could not parse received dict as JSON: {e}')
  59. return image_dict
  60. @staticmethod
  61. def _validate_extension(extension_suffix: str) -> str:
  62. extension_suffix = re.sub(r'\?.*', '', extension_suffix)
  63. possible_extensions = ('.jpg', '.png', '.mp4', '.gif')
  64. selection = [ext for ext in possible_extensions if ext == extension_suffix]
  65. if len(selection) == 1:
  66. return selection[0]
  67. else:
  68. raise SiteDownloaderError(f'"{extension_suffix}" is not recognized as a valid extension for Imgur')