telethon_archiver.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. import os, re
  2. import html
  3. from loguru import logger
  4. from telethon.sync import TelegramClient
  5. from telethon.errors import ChannelInvalidError
  6. from storages import Storage
  7. from .base_archiver import Archiver, ArchiveResult
  8. from configs import TelethonConfig
  9. from utils import getattr_or
  10. class TelethonArchiver(Archiver):
  11. name = "telethon"
  12. link_pattern = re.compile(r"https:\/\/t\.me(\/c){0,1}\/(.+)\/(\d+)")
  13. def __init__(self, storage: Storage, driver, config: TelethonConfig):
  14. super().__init__(storage, driver)
  15. if config:
  16. self.client = TelegramClient("./anon", config.api_id, config.api_hash)
  17. self.bot_token = config.bot_token
  18. def _get_media_posts_in_group(self, chat, original_post, max_amp=10):
  19. """
  20. Searches for Telegram posts that are part of the same group of uploads
  21. The search is conducted around the id of the original post with an amplitude
  22. of `max_amp` both ways
  23. Returns a list of [post] where each post has media and is in the same grouped_id
  24. """
  25. if getattr_or(original_post, "grouped_id") is None:
  26. return [original_post] if getattr_or(original_post, "media") else []
  27. search_ids = [i for i in range(original_post.id - max_amp, original_post.id + max_amp + 1)]
  28. posts = self.client.get_messages(chat, ids=search_ids)
  29. media = []
  30. for post in posts:
  31. if post is not None and post.grouped_id == original_post.grouped_id and post.media is not None:
  32. media.append(post)
  33. return media
  34. def download(self, url, check_if_exists=False):
  35. if not hasattr(self, "client"):
  36. logger.warning('Missing Telethon config')
  37. return False
  38. # detect URLs that we definitely cannot handle
  39. matches = self.link_pattern.findall(url)
  40. if not len(matches):
  41. return False
  42. status = "success"
  43. # app will ask (stall for user input!) for phone number and auth code if anon.session not found
  44. with self.client.start(bot_token=self.bot_token):
  45. matches = list(matches[0])
  46. chat, post_id = matches[1], matches[2]
  47. post_id = int(post_id)
  48. try:
  49. post = self.client.get_messages(chat, ids=post_id)
  50. except ValueError as e:
  51. logger.error(f"Could not fetch telegram {url} possibly it's private: {e}")
  52. return False
  53. except ChannelInvalidError as e:
  54. logger.error(f"Could not fetch telegram {url}. This error can be fixed if you setup a bot_token in addition to api_id and api_hash: {e}")
  55. return False
  56. if post is None: return False
  57. media_posts = self._get_media_posts_in_group(chat, post)
  58. logger.debug(f'got {len(media_posts)=} for {url=}')
  59. screenshot = self.get_screenshot(url)
  60. if len(media_posts) > 0:
  61. key = self.get_html_key(url)
  62. if check_if_exists and self.storage.exists(key):
  63. # only s3 storage supports storage.exists as not implemented on gd
  64. cdn_url = self.storage.get_cdn_url(key)
  65. return ArchiveResult(status='already archived', cdn_url=cdn_url, title=post.message, timestamp=post.date, screenshot=screenshot)
  66. key_thumb, thumb_index = None, None
  67. group_id = post.grouped_id if post.grouped_id is not None else post.id
  68. uploaded_media = []
  69. message = post.message
  70. for mp in media_posts:
  71. if len(mp.message) > len(message): message = mp.message
  72. # media can also be in entities
  73. if mp.entities:
  74. other_media_urls = [e.url for e in mp.entities if hasattr(e, "url") and e.url and self._guess_file_type(e.url) in ["video", "image"]]
  75. logger.debug(f"Got {len(other_media_urls)} other medial urls from {mp.id=}: {other_media_urls}")
  76. for om_url in other_media_urls:
  77. filename = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}_{self._get_key_from_url(om_url)}')
  78. self.download_from_url(om_url, filename)
  79. key = filename.split(Storage.TMP_FOLDER)[1]
  80. self.storage.upload(filename, key)
  81. hash = self.get_hash(filename)
  82. cdn_url = self.storage.get_cdn_url(key)
  83. uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
  84. filename_dest = os.path.join(Storage.TMP_FOLDER, f'{chat}_{group_id}', str(mp.id))
  85. filename = self.client.download_media(mp.media, filename_dest)
  86. if not filename:
  87. logger.debug(f"Empty media found, skipping {str(mp)=}")
  88. continue
  89. key = filename.split(Storage.TMP_FOLDER)[1]
  90. self.storage.upload(filename, key)
  91. hash = self.get_hash(filename)
  92. cdn_url = self.storage.get_cdn_url(key)
  93. uploaded_media.append({'cdn_url': cdn_url, 'key': key, 'hash': hash})
  94. if key_thumb is None:
  95. key_thumb, thumb_index = self.get_thumbnails(filename, key)
  96. os.remove(filename)
  97. page_cdn, page_hash, _ = self.generate_media_page_html(url, uploaded_media, html.escape(str(post)))
  98. return ArchiveResult(status=status, cdn_url=page_cdn, title=message, timestamp=post.date, hash=page_hash, screenshot=screenshot, thumbnail=key_thumb, thumbnail_index=thumb_index)
  99. page_cdn, page_hash, _ = self.generate_media_page_html(url, [], html.escape(str(post)))
  100. return ArchiveResult(status=status, cdn_url=page_cdn, title=post.message, timestamp=getattr_or(post, "date"), hash=page_hash, screenshot=screenshot)