youtubedl_archiver.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. import os, datetime
  2. import yt_dlp
  3. from loguru import logger
  4. from .base_archiver import Archiver, ArchiveResult
  5. from storages import Storage
  6. class YoutubeDLArchiver(Archiver):
  7. name = "youtube_dl"
  8. ydl_opts = {'outtmpl': f'{Storage.TMP_FOLDER}%(id)s.%(ext)s', 'quiet': False}
  9. def __init__(self, storage: Storage, driver, fb_cookie):
  10. super().__init__(storage, driver)
  11. self.fb_cookie = fb_cookie
  12. def download(self, url, check_if_exists=False):
  13. netloc = self.get_netloc(url)
  14. if netloc in ['facebook.com', 'www.facebook.com'] and self.fb_cookie:
  15. logger.debug('Using Facebook cookie')
  16. yt_dlp.utils.std_headers['cookie'] = self.fb_cookie
  17. ydl = yt_dlp.YoutubeDL(YoutubeDLArchiver.ydl_opts)
  18. cdn_url = None
  19. status = 'success'
  20. try:
  21. info = ydl.extract_info(url, download=False)
  22. except yt_dlp.utils.DownloadError as e:
  23. logger.debug(f'No video - Youtube normal control flow: {e}')
  24. return False
  25. except Exception as e:
  26. logger.debug(f'ytdlp exception which is normal for example a facebook page with images only will cause a IndexError: list index out of range. Exception here is: \n {e}')
  27. return False
  28. if info.get('is_live', False):
  29. logger.warning("Live streaming media, not archiving now")
  30. return ArchiveResult(status="Streaming media")
  31. if 'twitter.com' in netloc:
  32. if 'https://twitter.com/' in info['webpage_url']:
  33. logger.info('Found https://twitter.com/ in the download url from Twitter')
  34. else:
  35. logger.info('Found a linked video probably in a link in a tweet - not getting that video as there may be images in the tweet')
  36. return False
  37. if check_if_exists:
  38. if 'entries' in info:
  39. if len(info['entries']) > 1:
  40. logger.warning('YoutubeDLArchiver succeeded but cannot archive channels or pages with multiple videos')
  41. return False
  42. elif len(info['entries']) == 0:
  43. logger.warning(
  44. 'YoutubeDLArchiver succeeded but did not find video')
  45. return False
  46. filename = ydl.prepare_filename(info['entries'][0])
  47. else:
  48. filename = ydl.prepare_filename(info)
  49. key = self.get_key(filename)
  50. if self.storage.exists(key):
  51. status = 'already archived'
  52. cdn_url = self.storage.get_cdn_url(key)
  53. # sometimes this results in a different filename, so do this again
  54. info = ydl.extract_info(url, download=True)
  55. # TODO: add support for multiple videos
  56. if 'entries' in info:
  57. if len(info['entries']) > 1:
  58. logger.warning(
  59. 'YoutubeDLArchiver cannot archive channels or pages with multiple videos')
  60. return False
  61. else:
  62. info = info['entries'][0]
  63. filename = ydl.prepare_filename(info)
  64. if not os.path.exists(filename):
  65. filename = filename.split('.')[0] + '.mkv'
  66. if status != 'already archived':
  67. key = self.get_key(filename)
  68. self.storage.upload(filename, key)
  69. # filename ='tmp/sDE-qZdi8p8.webm'
  70. # key ='SM0022/youtube_dl_sDE-qZdi8p8.webm'
  71. cdn_url = self.storage.get_cdn_url(key)
  72. hash = self.get_hash(filename)
  73. screenshot = self.get_screenshot(url)
  74. # get duration
  75. duration = info.get('duration')
  76. # get thumbnails
  77. try:
  78. key_thumb, thumb_index = self.get_thumbnails(filename, key, duration=duration)
  79. except:
  80. key_thumb = ''
  81. thumb_index = 'Could not generate thumbnails'
  82. os.remove(filename)
  83. timestamp = None
  84. if 'timestamp' in info and info['timestamp'] is not None:
  85. timestamp = datetime.datetime.utcfromtimestamp(info['timestamp']).replace(tzinfo=datetime.timezone.utc).isoformat()
  86. elif 'upload_date' in info and info['upload_date'] is not None:
  87. timestamp = datetime.datetime.strptime(info['upload_date'], '%Y%m%d').replace(tzinfo=datetime.timezone.utc)
  88. return ArchiveResult(status=status, cdn_url=cdn_url, thumbnail=key_thumb, thumbnail_index=thumb_index, duration=duration,
  89. title=info['title'] if 'title' in info else None, timestamp=timestamp, hash=hash, screenshot=screenshot)