webexteamsarchiver.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
"""Webex Teams Room Archiver.

Copyright (c) 2018-2021 Cisco and/or its affiliates.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import concurrent.futures
import datetime
import json
import logging
import os
import re
import shutil
from collections import namedtuple

import requests
from webexteamssdk import WebexTeamsAPI
from webexteamssdk.exceptions import MalformedResponse, ApiError
from webexteamssdk.generator_containers import GeneratorContainer
from webexteamssdk.models.immutable import Person

from .jinja_env import env as jinja_env
from .jinja_env import sanitize_name
  34. __all__ = ['WebexTeamsArchiver', 'File', 'UserNotFound', 'UserApiFailed']
  35. File = namedtuple(
  36. "File", "content_disposition content_length content_type filename deleted")
  37. UserNotFound = namedtuple(
  38. "UserNotFound", "id emails displayName avatar"
  39. )
  40. UserApiFailed = namedtuple(
  41. "UserApiFailed", "id emails displayName avatar"
  42. )
  43. logger = logging.getLogger(__name__)
  44. logger.setLevel(logging.INFO)
  45. class WebexTeamsArchiver:
  46. """
  47. Initializes object that can be used to archive a Webex Teams room.
  48. Args:
  49. access_token: User's personal Webex Teams API bearer token.
  50. single_request_timeout: Timeout in seconds for the API requests.
  51. special_token: The supplied access_token has access to all messages in a space.
  52. Raises:
  53. webexteamssdkException: An error occurred calling the Webex Teams API.
  54. """
  55. def __init__(self, access_token: str, single_request_timeout: int = 60, special_token: bool = False) -> None:
  56. self.access_token = access_token
  57. self.special_token = special_token
  58. self.sdk = WebexTeamsAPI(
  59. self.access_token, single_request_timeout=single_request_timeout)
  60. def file_details(self, url: str) -> File:
  61. """
  62. Retrieves the file details using the Webex Teams attachments endpoint.
  63. Args:
  64. url: The URL of the file found in the files list in the message.
  65. single_request_timeout: Webex API call single request timeout.
  66. Returns:
  67. File: Details about the file.
  68. """
  69. headers = {
  70. "Authorization": f"Bearer {self.access_token}",
  71. "Accept-Encoding": "", # ensures content-length always gets returned
  72. }
  73. r = requests.head(url, headers=headers)
  74. if r.status_code == 404:
  75. # Item must have been deleted since url was retrieved
  76. return File("", 0, "", "", True)
  77. if r.ok:
  78. filename_re = re.search(r"filename=\"(.+?)\"",
  79. r.headers.get("Content-Disposition", ""), re.I)
  80. if not filename_re:
  81. new_filename = re.sub(r'^.+/([^/]+)$', r'\1', url)
  82. message = (
  83. f"Set filename to '{new_filename}' in {r.headers.get('Content-Disposition', '')} for url {url}"
  84. )
  85. logger.debug(message)
  86. filename_re = re.search(r"filename=\"(.+?)\"", f"filename=\"{new_filename}\"", re.I)
  87. return File(r.headers.get("Content-Disposition", ""),
  88. r.headers.get("Content-Length", 0),
  89. r.headers.get("Content-Type", ""),
  90. sanitize_name(filename_re.group(1)),
  91. False)
  92. else:
  93. return File("", 0, "", "UNKNOWN", True)
  94. def archive_room(self, room_id: str, text_format: bool = True, html_format: bool = True,
  95. json_format: bool = True, **options) -> str:
  96. """
  97. Archives a Webex Teams room. This creates a file called roomTitle_timestamp_roomId with the
  98. appropriate file extension as defined by file_format param with the following contents:
  99. - roomTitle_roomId.txt - Text version of the conversations (if `text_format` is True)
  100. - roomTitle_roomId.html - HTML version of the conversations (if `html_format` is True)
  101. - files/ - Attachments added to the room (if `download_attachments` is True)
  102. Args:
  103. room_id: ID of the room to archive.
  104. text_format: Create a text version of the archive.
  105. html_format: Create an HTML version of the archive.
  106. json_format: Create a json version of the archive.
  107. Options:
  108. compress_folder: Compress archive folder.
  109. delete_folder: Delete the archive folder when done.
  110. reverse_order: Order messages by most recent on the bottom.
  111. download_attachments: Download attachments sent to the room.
  112. download_avatars: Download avatar images.
  113. download_workers: Number of download workers for downloading files.
  114. timestamp_format: Timestamp strftime format.
  115. file_format: Archive format as supported by shutil.make_archive
  116. Returns:
  117. Name of archive file.
  118. Raises:
  119. IOError: Error occurred while creating/writing to files.
  120. shutil.Error: Error occurred creating/copying/deleting files/folders.
  121. ValueError: Exception message will contain more details.
  122. TypeError: Messages contained non JSON serializable data.
  123. webexteamssdkException: An error occurred calling the Webex Teams API.
  124. """
  125. # Configure options
  126. compress_folder = options.get("compress_folder", True)
  127. delete_folder = options.get("delete_folder", False)
  128. reverse_order = options.get("reverse_order", True)
  129. download_attachments = options.get("download_attachments", True)
  130. download_avatars = options.get("download_avatars", True)
  131. download_workers = options.get("download_workers", 15)
  132. timestamp_format = options.get("timestamp_format", "%Y-%m-%dT%H:%M:%S")
  133. file_format = options.get("file_format", "gztar")
  134. if delete_folder and not compress_folder:
  135. raise ValueError("delete_folder cannot be True while compress_folder is False")
  136. self._gather_room_information(room_id, download_avatars)
  137. # Prepare folder
  138. self._setup_folder(download_attachments, download_avatars, html_format)
  139. try:
  140. self._archive(reverse_order, download_attachments, download_avatars, download_workers,
  141. text_format, html_format, json_format, timestamp_format)
  142. if compress_folder:
  143. filename = self._compress_folder(file_format)
  144. else:
  145. filename = self.archive_folder_name
  146. except Exception:
  147. self._tear_down_folder()
  148. raise
  149. if delete_folder:
  150. self._tear_down_folder()
  151. return filename
  152. def _archive(self, reverse_order: bool, download_attachments: bool,
  153. download_avatars: bool, download_workers: int, text_format: bool,
  154. html_format: bool, json_format: bool, timestamp_format: str) -> None:
  155. """
  156. Collects room messages and attachments using Webex Teams
  157. APIs and writes them to text/html files.
  158. """
  159. if reverse_order:
  160. self.messages_with_threads = list(reversed(list(self.messages_with_threads)))
  161. else:
  162. self.messages_with_threads = list(self.messages_with_threads)
  163. if html_format:
  164. self._create_html_transcript(self.messages_with_threads, self.attachments, self.people,
  165. download_avatars, timestamp_format)
  166. logger.debug("HTML transcript completed.")
  167. if text_format:
  168. self._create_text_transcript(
  169. self.messages_with_threads, self.attachments, self.people, timestamp_format)
  170. logger.debug("Text transcript completed.")
  171. if json_format:
  172. self._create_json_transcript(self.messages)
  173. logger.debug("JSON transcript completed.")
  174. if download_attachments:
  175. self._download_files(
  176. "attachments", self.attachments, download_workers)
  177. logger.debug("Attachments download completed.")
  178. if download_avatars:
  179. self._download_files("avatars", self.avatars, download_workers)
  180. logger.debug("Avatars download completed.")
  181. # Write space information to json file
  182. with open(os.path.join(os.getcwd(), self.archive_folder_name, f"space_details.json"), "w", encoding="utf-8") as fh:
  183. space_details = {
  184. "space": self.room.to_dict(),
  185. "creator": self.room_creator._asdict() if isinstance(self.room_creator, UserNotFound)
  186. else self.room_creator.to_dict(),
  187. }
  188. json.dump(space_details, fh)
  189. logger.info("Room %s archived successfully.", self.room.id)
  190. def _setup_folder(self, download_attachments: bool,
  191. download_avatars, html_format: bool) -> None:
  192. """Creates a folder roomTitle_roomId to store archive."""
  193. os.makedirs(self.archive_folder_name)
  194. if download_attachments:
  195. os.makedirs(f"{self.archive_folder_name}/attachments")
  196. if download_avatars:
  197. os.makedirs(f"{self.archive_folder_name}/avatars")
  198. if html_format:
  199. basepath = os.path.dirname(os.path.realpath(__file__))
  200. shutil.copytree(f"{basepath}/static/.css",
  201. f"{self.archive_folder_name}/.css")
  202. shutil.copytree(f"{basepath}/static/.js",
  203. f"{self.archive_folder_name}/.js")
  204. shutil.copytree(f"{basepath}/static/.fonts",
  205. f"{self.archive_folder_name}/.fonts")
  206. def _tear_down_folder(self) -> None:
  207. """Deletes the roomTitle_roomId folder in case an exception was raised."""
  208. if os.path.isdir(self.archive_folder_name):
  209. shutil.rmtree(self.archive_folder_name, ignore_errors=False)
  210. def _gather_room_information(self, room_id: str, download_avatars: bool) -> None:
  211. """Calls Webex Teams APIs to get room information and messages."""
  212. # Structure: {"personId": webexteamssdk.models.immutable.Person}
  213. self.people = {}
  214. # Structure: {"url": File}
  215. self.attachments = {}
  216. # Structure: {"url": File}
  217. self.avatars = {}
  218. # Threads: {"parentId": [webexteamssdk.models.immutable.Message, ...]}
  219. self.threads = {}
  220. self.room = self.sdk.rooms.get(room_id)
  221. timestamp = datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%SZ")
  222. self.archive_folder_name = f"{sanitize_name(self.room.title)}_{timestamp}"
  223. try:
  224. self.room_creator = self.sdk.people.get(self.room.creatorId)
  225. except ApiError as e:
  226. if e.response.status_code == 404:
  227. self.room_creator = UserNotFound(
  228. id=self.room.creatorId,
  229. emails=["unknown"],
  230. displayName="Person Not Found",
  231. avatar=None,
  232. )
  233. else:
  234. logger.error(e)
  235. raise
  236. if self.room.type == "group" and self.sdk.people.me().type == "bot" and not self.special_token:
  237. self.messages = self.sdk.messages.list(
  238. room_id, mentionedPeople="me")
  239. else:
  240. self.messages = self.sdk.messages.list(room_id)
  241. self.messages_with_threads = self.messages
  242. self._organize_by_threads(self.messages, download_avatars)
  243. def _organize_by_threads(self, messages: GeneratorContainer, download_avatars: bool) -> None:
  244. """Extracts threaded messages from all messages."""
  245. for index, msg in enumerate(messages):
  246. if hasattr(msg, "parentId"):
  247. if msg.parentId in self.threads:
  248. self.threads[msg.parentId].insert(0, msg)
  249. else:
  250. self.threads[msg.parentId] = [msg]
  251. if msg.personId and msg.personId not in self.people:
  252. try:
  253. self.people[msg.personId] = self.sdk.people.get(
  254. msg.personId)
  255. if not msg.personEmail:
  256. if isinstance(self.people[msg.personId], Person) and \
  257. isinstance(self.people[msg.personId].emails, list) and \
  258. len(self.people[msg.personId].emails) > 0:
  259. msg.personEmail = self.people[msg.personId].emails[0]
  260. except ApiError as e:
  261. if e.response.status_code == 404:
  262. self.people[msg.personId] = UserNotFound(
  263. id=str(msg.personId),
  264. emails=[str(msg.personEmail)],
  265. displayName="Person Not Found",
  266. avatar=None,
  267. )
  268. else:
  269. logger.error(e)
  270. self.people[msg.personId] = UserApiFailed(
  271. id=str(msg.personId),
  272. emails=[str(msg.personEmail)],
  273. displayName="User API Failed",
  274. avatar=None,
  275. )
  276. if download_avatars and self.people[msg.personId].avatar:
  277. self.avatars[self.people[msg.personId].avatar] = File(
  278. "", "", "", msg.personId, False)
  279. if msg.files:
  280. for url in msg.files:
  281. file_metadata = self.file_details(url)
  282. self.attachments[url] = file_metadata
  283. return
  284. def _create_text_transcript(self, messages: list, attachments: dict, people: dict,
  285. timestamp_format: str) -> None:
  286. """Writes room messages to a text file."""
  287. template = jinja_env.get_template("default.txt")
  288. text_transcript = template.render(
  289. room=self.room,
  290. room_creator=self.room_creator,
  291. messages=messages,
  292. attachments=attachments,
  293. people=people,
  294. timestamp_format=timestamp_format,
  295. threads=self.threads
  296. )
  297. with open(os.path.join(os.getcwd(), self.archive_folder_name, f"{self.archive_folder_name}.txt"), "w", encoding="utf-8") as fh:
  298. fh.write(text_transcript)
  299. def _create_json_transcript(self, messages: GeneratorContainer) -> None:
  300. """Writes room messages to a JSON file."""
  301. data = {
  302. "items": [m.to_dict() for m in messages]
  303. }
  304. with open(os.path.join(os.getcwd(), self.archive_folder_name, f"{self.archive_folder_name}.json"), "w", encoding="utf-8") as fh:
  305. json.dump(data, fh)
  306. def _create_html_transcript(self, messages: list, attachments: dict, people: dict,
  307. download_avatars: dict, timestamp_format: str) -> None:
  308. """Writes room messages to an HTML file."""
  309. template = jinja_env.get_template("default.html")
  310. html = template.render(
  311. room=self.room,
  312. room_creator=self.room_creator,
  313. messages=messages,
  314. attachments=attachments,
  315. people=people,
  316. download_avatars=download_avatars,
  317. timestamp_format=timestamp_format,
  318. threads=self.threads
  319. )
  320. with open(os.path.join(os.getcwd(), self.archive_folder_name, f"{self.archive_folder_name}.html"), "w", encoding="utf-8") as fh:
  321. fh.write(html)
  322. def _download_files(self, folder_name: str, links: dict, workers: int) -> None:
  323. """Downloads files given a list of URL links."""
  324. with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
  325. result = {
  326. executor.submit(self._download_file, folder_name, url, links[url].filename): url for url in links if not links[url].deleted
  327. }
  328. # Do this to check if any downloads failed.
  329. for future in concurrent.futures.as_completed(result):
  330. future.result()
  331. def _download_file(self, folder_name: str, url: str, filename: str) -> None:
  332. """Download file from Webex Teams."""
  333. headers = {
  334. "Authorization": f"Bearer {self.access_token}"
  335. }
  336. # https://stackoverflow.com/questions/16694907/how-to-download-
  337. # large-file-in-python-with-requests-py
  338. # Removing as it's not support in all requests versions
  339. # with requests.get(url, headers=headers, stream=True) as r:
  340. # with open(os.path.join(os.getcwd(), self.archive_folder_name, folder_name, f"{filename}"), "wb") as f:
  341. # shutil.copyfileobj(r.raw, f)
  342. r = requests.get(url, headers=headers, stream=True)
  343. with open(os.path.join(os.getcwd(), self.archive_folder_name, folder_name, f"{filename}"), "wb") as f:
  344. for chunk in r.iter_content(chunk_size=1024):
  345. f.write(chunk)
  346. def _compress_folder(self, file_format: str) -> str:
  347. """Compress `archive_folder_name` folder with the format defined by file_format param"""
  348. return shutil.make_archive(self.archive_folder_name, file_format, self.archive_folder_name)