vidble.py

#!/usr/bin/env python3
# coding=utf-8

import itertools
import logging
import re
from typing import Optional

import bs4
import requests
from praw.models import Submission

from bdfr.exceptions import SiteDownloaderError
from bdfr.resource import Resource
from bdfr.site_authenticator import SiteAuthenticator
from bdfr.site_downloaders.base_downloader import BaseDownloader

logger = logging.getLogger(__name__)


class Vidble(BaseDownloader):
    def __init__(self, post: Submission):
        super().__init__(post)

    def find_resources(self, authenticator: Optional[SiteAuthenticator] = None) -> list[Resource]:
        try:
            res = self.get_links(self.post.url)
        except AttributeError:
            raise SiteDownloaderError(f'Could not read page at {self.post.url}')
        if not res:
            raise SiteDownloaderError(f'No resources found at {self.post.url}')
        res = [Resource(self.post, r, Resource.retry_download(r)) for r in res]
        return res

    @staticmethod
    def get_links(url: str) -> set[str]:
        # Normalise bare vidble links (e.g. vidble.com/abc123) to their /show/ page
        if not re.search(r'vidble\.com/(show/|album/|watch\?v)', url):
            url = re.sub(r'/(\w*?)$', r'/show/\1', url)
        page = requests.get(url)
        soup = bs4.BeautifulSoup(page.text, 'html.parser')
        content_div = soup.find('div', attrs={'id': 'ContentPlaceHolder1_divContent'})
        # Collect image and MP4 <source> URLs from the gallery container
        images = content_div.find_all('img')
        images = [i.get('src') for i in images]
        videos = content_div.find_all('source', attrs={'type': 'video/mp4'})
        videos = [v.get('src') for v in videos]
        resources = filter(None, itertools.chain(images, videos))
        resources = ['https://www.vidble.com' + r for r in resources]
        resources = [Vidble.change_med_url(r) for r in resources]
        return set(resources)

    @staticmethod
    def change_med_url(url: str) -> str:
        # Strip the '_med' thumbnail suffix so the full-size file is downloaded
        out = re.sub(r'_med(\..{3,4})$', r'\1', url)
        return out
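

# The block below is an illustrative usage sketch, not part of the upstream
# module: the example URL 'https://www.vidble.com/abc123_med.jpg' is a
# hypothetical placeholder. get_links() needs live network access to
# vidble.com, so it is only shown in a comment here.
if __name__ == '__main__':
    # '_med' thumbnail suffixes are stripped to point at the full-size file:
    # https://www.vidble.com/abc123_med.jpg -> https://www.vidble.com/abc123.jpg
    print(Vidble.change_med_url('https://www.vidble.com/abc123_med.jpg'))

    # With network access, a show/album page could be scraped like this:
    # print(Vidble.get_links('https://vidble.com/album/abc123'))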