base_archiver.py

import os, datetime, shutil, hashlib, time, requests, re, mimetypes
from dataclasses import dataclass
from abc import ABC, abstractmethod
from urllib.parse import urlparse
from random import randrange

import ffmpeg
from loguru import logger
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from slugify import slugify

from storages import Storage
from utils import mkdir_if_not_exists


@dataclass
class ArchiveResult:
    status: str
    cdn_url: str = None
    thumbnail: str = None
    thumbnail_index: str = None
    duration: float = None
    title: str = None
    timestamp: datetime.datetime = None
    screenshot: str = None
    hash: str = None


class Archiver(ABC):
    name = "default"
    retry_regex = r"retrying at (\d+)$"

    def __init__(self, storage: Storage, driver):
        self.storage = storage
        self.driver = driver

    def __str__(self):
        return self.__class__.__name__

    def __repr__(self):
        return self.__str__()

    @abstractmethod
    def download(self, url, check_if_exists=False): pass

    def get_netloc(self, url):
        return urlparse(url).netloc

    # generate the html page, e.g. SM3013/twitter__minmyatnaing13_status_1499415562937503751.html
    def generate_media_page_html(self, url, urls_info: list, object, thumbnail=None):
        """
        Generates an index.html page where each item in @urls_info is displayed
        """
        page = f'''<html><head><title>{url}</title><meta charset="UTF-8"></head>
<body>
<h2>Archived media from {self.name}</h2>
<h3><a href="{url}">{url}</a></h3><ul>'''
        for url_info in urls_info:
            mime_global = self._guess_file_type(url_info["key"])
            preview = ""
            if mime_global == "image":
                preview = f'<img src="{url_info["cdn_url"]}" style="max-height:200px;max-width:400px;"></img>'
            elif mime_global == "video":
                preview = f'<video src="{url_info["cdn_url"]}" controls style="max-height:400px;max-width:400px;"></video>'
            page += f'''<li><a href="{url_info['cdn_url']}">{preview}{url_info['key']}</a>: {url_info['hash']}</li>'''
        page += f"</ul><h2>{self.name} object data:</h2><code>{object}</code>"
        page += "</body></html>"
        page_key = self.get_html_key(url)
        page_filename = os.path.join(Storage.TMP_FOLDER, page_key)
        with open(page_filename, "w") as f:
            f.write(page)
        page_hash = self.get_hash(page_filename)
        self.storage.upload(page_filename, page_key, extra_args={
            'ACL': 'public-read', 'ContentType': 'text/html'})
        page_cdn = self.storage.get_cdn_url(page_key)
        return (page_cdn, page_hash, thumbnail)

    def _guess_file_type(self, path: str):
        """
        Receives a URL or filename and returns the top-level mimetype, e.g. 'image' or 'video';
        see https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types/Common_types
        """
        mime = mimetypes.guess_type(path)[0]
        if mime is not None:
            return mime.split("/")[0]
        return ""
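    # e.g. _guess_file_type("photo.jpg") -> "image", _guess_file_type("clip.mp4") -> "video",
    # and "" when mimetypes cannot resolve the path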

    def download_from_url(self, url, to_filename):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        d = requests.get(url, headers=headers)
        with open(to_filename, 'wb') as f:
            f.write(d.content)
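
    # A minimal sketch, not part of the original class: download_from_url above buffers
    # the whole response in memory, which can be a problem for large media files. A
    # streaming variant using requests' stream=True/iter_content would look like this;
    # the method name and chunk size are illustrative choices.
    def _download_from_url_streaming(self, url, to_filename, chunk_size=8192):
        with requests.get(url, stream=True) as d:
            with open(to_filename, 'wb') as f:
                for chunk in d.iter_content(chunk_size=chunk_size):
                    f.write(chunk)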

    def generate_media_page(self, urls, url, object):
        """
        For a list of media URLs: fetch and upload each one,
        then call self.generate_media_page_html with the results
        """
        thumbnail = None
        uploaded_media = []
        for media_url in urls:
            # default to .jpg when the media URL carries no extension of its own
            key = self._get_key_from_url(media_url, ".jpg")
            filename = os.path.join(Storage.TMP_FOLDER, key)
            self.download_from_url(media_url, filename)
            self.storage.upload(filename, key)
            hash = self.get_hash(filename)
            cdn_url = self.storage.get_cdn_url(key)
            # the first uploaded item doubles as the page thumbnail
            if thumbnail is None:
                thumbnail = cdn_url
            uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
        return self.generate_media_page_html(url, uploaded_media, object, thumbnail=thumbnail)

    def get_key(self, filename):
        """
        returns a key in the format "[archiverName]_[filename]", extension included
        """
        tail = os.path.split(filename)[1]  # returns filename.ext from full path
        _id, extension = os.path.splitext(tail)  # returns [filename, .ext]
        if 'unknown_video' in _id:
            _id = _id.replace('unknown_video', 'jpg')
        # long filenames can cause problems, so trim them if necessary
        if len(_id) > 128:
            _id = _id[-128:]
        return f'{self.name}_{_id}{extension}'
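    # e.g. for an archiver with name "youtube": get_key("/tmp/abc123.mp4") -> "youtube_abc123.mp4"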

    def get_html_key(self, url):
        return self._get_key_from_url(url, ".html")

    def _get_key_from_url(self, url, with_extension: str = None, append_datetime: bool = False):
        """
        Receives a URL and returns a slugified version of the URL path;
        if a string is passed in @with_extension, it is appended when the slug contains no ".";
        if @append_datetime is true, a UTC timestamp is added after the URL slug and before the extension
        """
        url_path = urlparse(url).path
        path, ext = os.path.splitext(url_path)
        slug = slugify(path)
        if append_datetime:
            slug += "-" + slugify(datetime.datetime.utcnow().isoformat())
        if len(ext):
            slug += ext
        if with_extension is not None:
            if "." not in slug:
                slug += with_extension
        return self.get_key(slug)
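    # Worked example (illustrative, assuming default slugify settings): for an archiver
    # named "twitter", _get_key_from_url("https://twitter.com/user/status/123", ".html")
    # slugifies the path to "user-status-123", appends ".html" since the slug has no ".",
    # and get_key prefixes the archiver name, giving "twitter_user-status-123.html"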

    def get_hash(self, filename):
        with open(filename, "rb") as f:
            bytes = f.read()  # read entire file as bytes
        # TODO: customizable hash
        hash = hashlib.sha256(bytes)
        # option to use SHA3_512 instead
        # hash = hashlib.sha3_512(bytes)
        return hash.hexdigest()

    def get_screenshot(self, url):
        logger.debug(f"getting screenshot for {url=}")
        key = self._get_key_from_url(url, ".png", append_datetime=True)
        filename = os.path.join(Storage.TMP_FOLDER, key)
        # accept Facebook's cookie popup first, otherwise it obscures the page in the screenshot
        if 'facebook.com' in url:
            try:
                logger.debug(f'Trying fb click accept cookie popup for {url}')
                self.driver.get("http://www.facebook.com")
                foo = self.driver.find_element(By.XPATH, "//button[@data-cookiebanner='accept_only_essential_button']")
                foo.click()
                logger.debug('fb click worked')
                # linux server needs a sleep, otherwise the facebook cookie won't stick and the popup reappears on the next page
                time.sleep(2)
            except Exception:
                logger.warning(f'Failed on fb accept cookies for url {url}')
        try:
            self.driver.get(url)
            time.sleep(6)
        except TimeoutException:
            logger.info("TimeoutException loading page for screenshot")
        self.driver.save_screenshot(filename)
        self.storage.upload(filename, key, extra_args={
            'ACL': 'public-read', 'ContentType': 'image/png'})
        return self.storage.get_cdn_url(key)

    def get_thumbnails(self, filename, key, duration=None):
        thumbnails_folder = os.path.splitext(filename)[0] + os.path.sep
        key_folder = key.split('.')[0] + os.path.sep
        mkdir_if_not_exists(thumbnails_folder)
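        # sampling rate: with no known duration, grab one frame every 2s (fps=0.5);
        # otherwise aim for a fixed frame budget: 10 frames under 60s, 20 under 120s,
        # 40 beyond that. E.g. a 90s video gets fps = 20/90 ≈ 0.22, one frame every ~4.5s.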
        fps = 0.5
        if duration is not None:
            duration = float(duration)
            if duration < 60:
                fps = 10.0 / duration
            elif duration < 120:
                fps = 20.0 / duration
            else:
                fps = 40.0 / duration
        stream = ffmpeg.input(filename)
        stream = ffmpeg.filter(stream, 'fps', fps=fps).filter('scale', 512, -1)
        stream.output(thumbnails_folder + 'out%d.jpg').run()
        thumbnails = os.listdir(thumbnails_folder)
        cdn_urls = []
        for fname in thumbnails:
            if fname[-3:] == 'jpg':
                thumbnail_filename = thumbnails_folder + fname
                key = os.path.join(key_folder, fname)
                self.storage.upload(thumbnail_filename, key)
                cdn_url = self.storage.get_cdn_url(key)
                cdn_urls.append(cdn_url)
        if len(cdn_urls) == 0:
            return ('', '')
        # use the frame ~10% into the list as the representative thumbnail
        key_thumb = cdn_urls[int(len(cdn_urls) * 0.1)]
        index_page = f'''<html><head><title>{filename}</title><meta charset="UTF-8"></head>
<body>'''
        for t in cdn_urls:
            index_page += f'<img src="{t}" />'
        index_page += "</body></html>"
        index_fname = thumbnails_folder + 'index.html'
        with open(index_fname, 'w') as f:
            f.write(index_page)
        thumb_index = key_folder + 'index.html'
        self.storage.upload(index_fname, thumb_index, extra_args={
            'ACL': 'public-read', 'ContentType': 'text/html'})
        shutil.rmtree(thumbnails_folder)
        thumb_index_cdn_url = self.storage.get_cdn_url(thumb_index)
        return (key_thumb, thumb_index_cdn_url)

    def signal_retry_in(self, min_seconds=1800, max_seconds=7200, **kwargs):
        """
        sets the status to retry at a random time between min_seconds and max_seconds from now
        """
        now = datetime.datetime.now().timestamp()
        retry_at = int(now + randrange(min_seconds, max_seconds))
        logger.debug(f"signaling {retry_at=}")
        return ArchiveResult(status=f'retrying at {retry_at}', **kwargs)

    @staticmethod
    def is_retry(status):
        return re.search(Archiver.retry_regex, status) is not None

    @staticmethod
    def should_retry_from_status(status):
        """
        checks status against the message set by signal_retry_in;
        returns True if enough time has elapsed, False otherwise
        """
        match = re.search(Archiver.retry_regex, status)
        if match:
            retry_at = int(match.group(1))
            now = datetime.datetime.now().timestamp()
            should_retry = now >= retry_at
            logger.debug(f"{should_retry=} since {now=} and {retry_at=}")
            return should_retry
        return False

    @staticmethod
    def remove_retry(status):
        """
        transforms a retry status into a terminal failure message
        """
        new_status = re.sub(Archiver.retry_regex, "failed: too many retries", status, 0)
        logger.debug(f"removing retry message at {status=}, got {new_status=}")
        return new_status
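

# Illustrative sketch, not part of the original module: a minimal concrete Archiver
# showing the download() contract and the retry round-trip. The archiving logic is a
# stand-in (it treats the page URL itself as the only media item); real subclasses do
# site-specific work.
class ExampleArchiver(Archiver):
    name = "example"

    def download(self, url, check_if_exists=False):
        try:
            page_cdn, page_hash, thumbnail = self.generate_media_page([url], url, {})
            return ArchiveResult(status="success", cdn_url=page_cdn, hash=page_hash, thumbnail=thumbnail)
        except requests.RequestException:
            # transient network failure: report a retry window of 30-120 minutes
            return self.signal_retry_in()

# How an orchestrator would honour the retry contract (`storage` and `driver` are
# hypothetical, as is the URL):
#   result = ExampleArchiver(storage, driver).download("https://example.com/post/1")
#   if Archiver.is_retry(result.status):
#       if Archiver.should_retry_from_status(result.status):
#           ...  # the retry window has elapsed; call download() again
#       else:
#           ...  # still inside the window; keep the status and check later, or give
#           ...  # up via Archiver.remove_retry(result.status) after too many attempts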