123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432 |
- """Webex Teams Room Archiver.
- Copyright (c) 2018-2021 Cisco and/or its affiliates.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
- """
- import concurrent.futures
- import os
- import re
- import requests
- import shutil
- import logging
- import json
- import datetime
- from collections import namedtuple
- from webexteamssdk import WebexTeamsAPI
- from webexteamssdk.exceptions import MalformedResponse, ApiError
- from webexteamssdk.models.immutable import Person
- from webexteamssdk.generator_containers import GeneratorContainer
- from .jinja_env import env as jinja_env
- from .jinja_env import sanitize_name
# Names exported by `from <module> import *`; everything else is internal.
__all__ = [
    "WebexTeamsArchiver",
    "File",
    "UserNotFound",
    "UserApiFailed",
]
# Metadata describing one downloadable item (message attachment or avatar).
# `deleted` marks entries whose content could not be located and must be
# skipped when downloading.
File = namedtuple(
    "File",
    [
        "content_disposition",
        "content_length",
        "content_type",
        "filename",
        "deleted",
    ],
)

# Stand-in for a person the People API reported as missing (HTTP 404).
# Carries the same four attributes the archiver reads from a real person.
UserNotFound = namedtuple(
    "UserNotFound",
    ["id", "emails", "displayName", "avatar"],
)

# Stand-in for a person whose People API lookup failed for any other reason.
UserApiFailed = namedtuple(
    "UserApiFailed",
    ["id", "emails", "displayName", "avatar"],
)

# Module-level logger; the level is pinned to INFO at import time.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
class WebexTeamsArchiver:
    """
    Initializes object that can be used to archive a Webex Teams room.

    Args:
        access_token: User's personal Webex Teams API bearer token.
        single_request_timeout: Timeout in seconds for the API requests. The
            same value is applied to the direct HTTP calls this class makes
            for attachment metadata and file downloads.
        special_token: The supplied access_token has access to all messages in a space.

    Raises:
        webexteamssdkException: An error occurred calling the Webex Teams API.
    """

    # Bytes written to disk per chunk while streaming a download.
    _DOWNLOAD_CHUNK_SIZE = 64 * 1024

    # Static asset folders copied next to the HTML transcript.
    _STATIC_ASSET_FOLDERS = (".css", ".js", ".fonts")

    def __init__(self, access_token: str, single_request_timeout: int = 60,
                 special_token: bool = False) -> None:
        self.access_token = access_token
        self.special_token = special_token
        # Kept so the raw `requests` calls below are bounded like the SDK calls.
        self.single_request_timeout = single_request_timeout
        self.sdk = WebexTeamsAPI(
            self.access_token, single_request_timeout=single_request_timeout)

    def file_details(self, url: str) -> File:
        """
        Retrieves the file details by issuing a HEAD request for the file URL.

        Args:
            url: The URL of the file found in the files list in the message.

        Returns:
            File: Details about the file. `deleted` is True when the server
            answers 404 (filename "") or with any other non-success status
            (filename "UNKNOWN").

        Raises:
            requests.RequestException: The request failed or timed out.
        """
        headers = {
            "Authorization": f"Bearer {self.access_token}",
            "Accept-Encoding": "",  # ensures content-length always gets returned
        }
        r = requests.head(url, headers=headers,
                          timeout=self.single_request_timeout)

        if r.status_code == 404:
            # Item must have been deleted since url was retrieved
            return File("", 0, "", "", True)
        if not r.ok:
            return File("", 0, "", "UNKNOWN", True)

        content_disposition = r.headers.get("Content-Disposition", "")
        filename_re = re.search(
            r"filename=\"(.+?)\"", content_disposition, re.I)
        if filename_re:
            filename = filename_re.group(1)
        else:
            # No quoted filename in the header: fall back to the last path
            # segment of the URL (or the whole URL if it has no such segment).
            filename = re.sub(r'^.+/([^/]+)$', r'\1', url)
            logger.debug("Set filename to '%s' in %s for url %s",
                         filename, content_disposition, url)

        return File(content_disposition,
                    r.headers.get("Content-Length", 0),
                    r.headers.get("Content-Type", ""),
                    sanitize_name(filename),
                    False)

    def archive_room(self, room_id: str, text_format: bool = True, html_format: bool = True,
                     json_format: bool = True, **options) -> str:
        """
        Archives a Webex Teams room. This creates a folder called
        roomTitle_timestamp (optionally compressed) with the following contents:

        - roomTitle_timestamp.txt - Text version of the conversations (if `text_format` is True)
        - roomTitle_timestamp.html - HTML version of the conversations (if `html_format` is True)
        - roomTitle_timestamp.json - JSON version of the conversations (if `json_format` is True)
        - space_details.json - Room and room creator details
        - attachments/ - Attachments added to the room (if `download_attachments` is True)
        - avatars/ - Avatar images (if `download_avatars` is True)

        Args:
            room_id: ID of the room to archive.
            text_format: Create a text version of the archive.
            html_format: Create an HTML version of the archive.
            json_format: Create a json version of the archive.

        Options:
            compress_folder: Compress archive folder.
            delete_folder: Delete the archive folder when done.
            reverse_order: Order messages by most recent on the bottom.
            download_attachments: Download attachments sent to the room.
            download_avatars: Download avatar images.
            download_workers: Number of download workers for downloading files.
            timestamp_format: Timestamp strftime format.
            file_format: Archive format as supported by shutil.make_archive

        Returns:
            Name of archive file, or the archive folder name when
            `compress_folder` is False.

        Raises:
            IOError: Error occurred while creating/writing to files.
            shutil.Error: Error occurred creating/copying/deleting files/folders.
            ValueError: Exception message will contain more details.
            TypeError: Messages contained non JSON serializable data.
            webexteamssdkException: An error occurred calling the Webex Teams API.
        """
        # Configure options
        compress_folder = options.get("compress_folder", True)
        delete_folder = options.get("delete_folder", False)
        reverse_order = options.get("reverse_order", True)
        download_attachments = options.get("download_attachments", True)
        download_avatars = options.get("download_avatars", True)
        download_workers = options.get("download_workers", 15)
        timestamp_format = options.get("timestamp_format", "%Y-%m-%dT%H:%M:%S")
        file_format = options.get("file_format", "gztar")

        if delete_folder and not compress_folder:
            raise ValueError("delete_folder cannot be True while compress_folder is False")

        self._gather_room_information(room_id, download_avatars)

        # Prepare folder. Deliberately outside the try block: if the folder
        # already exists, makedirs fails and nothing must be torn down.
        self._setup_folder(download_attachments, download_avatars, html_format)

        try:
            self._archive(reverse_order, download_attachments, download_avatars, download_workers,
                          text_format, html_format, json_format, timestamp_format)

            if compress_folder:
                filename = self._compress_folder(file_format)
            else:
                filename = self.archive_folder_name
        except Exception:
            # Do not leave a half-written archive folder behind.
            self._tear_down_folder()
            raise

        if delete_folder:
            self._tear_down_folder()

        return filename

    def _archive_path(self, *parts: str) -> str:
        """Returns the path of `parts` inside the archive folder under the current working directory."""
        return os.path.join(os.getcwd(), self.archive_folder_name, *parts)

    def _write_text_file(self, filename: str, content: str) -> None:
        """Writes `content` as UTF-8 text to `filename` inside the archive folder."""
        with open(self._archive_path(filename), "w", encoding="utf-8") as fh:
            fh.write(content)

    def _archive(self, reverse_order: bool, download_attachments: bool,
                 download_avatars: bool, download_workers: int, text_format: bool,
                 html_format: bool, json_format: bool, timestamp_format: str) -> None:
        """
        Writes the previously gathered room messages to the requested
        transcript files, downloads attachments/avatars and stores the room
        details in space_details.json.
        """
        # Work on a copy so `self.messages` keeps the order it was gathered in.
        ordered_messages = list(self.messages_with_threads)
        if reverse_order:
            ordered_messages.reverse()
        self.messages_with_threads = ordered_messages

        if html_format:
            self._create_html_transcript(self.messages_with_threads, self.attachments, self.people,
                                         download_avatars, timestamp_format)
            logger.debug("HTML transcript completed.")

        if text_format:
            self._create_text_transcript(
                self.messages_with_threads, self.attachments, self.people, timestamp_format)
            logger.debug("Text transcript completed.")

        if json_format:
            self._create_json_transcript(self.messages)
            logger.debug("JSON transcript completed.")

        if download_attachments:
            self._download_files(
                "attachments", self.attachments, download_workers)
            logger.debug("Attachments download completed.")

        if download_avatars:
            self._download_files("avatars", self.avatars, download_workers)
            logger.debug("Avatars download completed.")

        # Write space information to json file. Placeholder creators are
        # namedtuples (`_asdict`), anything else is converted with `to_dict`.
        creator = self.room_creator
        if isinstance(creator, (UserNotFound, UserApiFailed)):
            creator_details = creator._asdict()
        else:
            creator_details = creator.to_dict()
        space_details = {
            "space": self.room.to_dict(),
            "creator": creator_details,
        }
        with open(self._archive_path("space_details.json"), "w", encoding="utf-8") as fh:
            json.dump(space_details, fh)

        logger.info("Room %s archived successfully.", self.room.id)

    def _setup_folder(self, download_attachments: bool,
                      download_avatars: bool, html_format: bool) -> None:
        """Creates the roomTitle_timestamp folder (and sub-folders) to store the archive."""
        os.makedirs(self.archive_folder_name)
        if download_attachments:
            os.makedirs(f"{self.archive_folder_name}/attachments")
        if download_avatars:
            os.makedirs(f"{self.archive_folder_name}/avatars")
        if html_format:
            # The HTML transcript needs the bundled static assets next to it.
            basepath = os.path.dirname(os.path.realpath(__file__))
            for asset_folder in self._STATIC_ASSET_FOLDERS:
                shutil.copytree(f"{basepath}/static/{asset_folder}",
                                f"{self.archive_folder_name}/{asset_folder}")

    def _tear_down_folder(self) -> None:
        """Deletes the archive folder if it exists (after a failure or on request)."""
        if os.path.isdir(self.archive_folder_name):
            shutil.rmtree(self.archive_folder_name, ignore_errors=False)

    def _gather_room_information(self, room_id: str, download_avatars: bool) -> None:
        """Calls Webex Teams APIs to get room information and messages."""
        # Structure: {"personId": webexteamssdk.models.immutable.Person}
        self.people = {}
        # Structure: {"url": File}
        self.attachments = {}
        # Structure: {"url": File}
        self.avatars = {}
        # Threads: {"parentId": [webexteamssdk.models.immutable.Message, ...]}
        self.threads = {}

        self.room = self.sdk.rooms.get(room_id)
        # Aware "now" in UTC; formats identically to the deprecated utcnow().
        timestamp = datetime.datetime.now(
            datetime.timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        self.archive_folder_name = f"{sanitize_name(self.room.title)}_{timestamp}"

        try:
            self.room_creator = self.sdk.people.get(self.room.creatorId)
        except ApiError as e:
            if e.response.status_code == 404:
                self.room_creator = UserNotFound(
                    id=self.room.creatorId,
                    emails=["unknown"],
                    displayName="Person Not Found",
                    avatar=None,
                )
            else:
                logger.error(e)
                raise

        # A bot without the special token lists only messages mentioning it.
        # `special_token` is checked first to skip the `me()` call when it
        # cannot change the outcome.
        if (self.room.type == "group" and not self.special_token
                and self.sdk.people.me().type == "bot"):
            listing = self.sdk.messages.list(room_id, mentionedPeople="me")
        else:
            listing = self.sdk.messages.list(room_id)

        # Materialise once so every later pass (threads, transcripts, JSON)
        # works on the same snapshot instead of iterating the listing again.
        self.messages = list(listing)
        self.messages_with_threads = self.messages
        self._organize_by_threads(self.messages, download_avatars)

    def _lookup_person(self, msg):
        """
        Fetches the author of `msg` from the People API, returning a
        UserNotFound / UserApiFailed placeholder when the lookup fails.
        """
        try:
            person = self.sdk.people.get(msg.personId)
        except ApiError as e:
            if e.response.status_code == 404:
                return UserNotFound(
                    id=str(msg.personId),
                    emails=[str(msg.personEmail)],
                    displayName="Person Not Found",
                    avatar=None,
                )
            logger.error(e)
            return UserApiFailed(
                id=str(msg.personId),
                emails=[str(msg.personEmail)],
                displayName="User API Failed",
                avatar=None,
            )

        # Back-fill a missing author email from the person record.
        if (not msg.personEmail and isinstance(person, Person)
                and isinstance(person.emails, list) and len(person.emails) > 0):
            try:
                msg.personEmail = person.emails[0]
            except AttributeError:
                # NOTE(review): SDK models come from `models.immutable`, so
                # assignment is presumably rejectable - confirm. The archive
                # is still usable without the back-filled email.
                logger.debug("Could not set personEmail on message.")
        return person

    def _organize_by_threads(self, messages: list, download_avatars: bool) -> None:
        """
        Extracts threaded messages from all messages and collects the people,
        avatars and attachment details the messages reference.

        Args:
            messages: Messages of the room, in the order they were gathered.
            download_avatars: Register avatar URLs for later download.
        """
        for msg in messages:
            # Only real replies belong to a thread; a missing or empty
            # parentId must not create a bogus `None` thread.
            parent_id = getattr(msg, "parentId", None)
            if parent_id:
                # Inserted at the front, so each thread ends up in the
                # reverse of the order the messages were gathered in.
                self.threads.setdefault(parent_id, []).insert(0, msg)

            person_id = msg.personId
            if person_id and person_id not in self.people:
                person = self._lookup_person(msg)
                self.people[person_id] = person
                # Registered once per person; the file is named after the id.
                if download_avatars and person.avatar:
                    self.avatars[person.avatar] = File(
                        "", "", "", person_id, False)

            for url in msg.files or ():
                self.attachments[url] = self.file_details(url)

    def _create_text_transcript(self, messages: list, attachments: dict, people: dict,
                                timestamp_format: str) -> None:
        """Writes room messages to a text file."""
        template = jinja_env.get_template("default.txt")
        text_transcript = template.render(
            room=self.room,
            room_creator=self.room_creator,
            messages=messages,
            attachments=attachments,
            people=people,
            timestamp_format=timestamp_format,
            threads=self.threads
        )
        self._write_text_file(f"{self.archive_folder_name}.txt", text_transcript)

    def _create_json_transcript(self, messages: list) -> None:
        """Writes room messages to a JSON file."""
        data = {
            "items": [m.to_dict() for m in messages]
        }
        with open(self._archive_path(f"{self.archive_folder_name}.json"),
                  "w", encoding="utf-8") as fh:
            json.dump(data, fh)

    def _create_html_transcript(self, messages: list, attachments: dict, people: dict,
                                download_avatars: bool, timestamp_format: str) -> None:
        """Writes room messages to an HTML file."""
        template = jinja_env.get_template("default.html")
        html = template.render(
            room=self.room,
            room_creator=self.room_creator,
            messages=messages,
            attachments=attachments,
            people=people,
            download_avatars=download_avatars,
            timestamp_format=timestamp_format,
            threads=self.threads
        )
        self._write_text_file(f"{self.archive_folder_name}.html", html)

    def _download_files(self, folder_name: str, links: dict, workers: int) -> None:
        """
        Downloads every non-deleted entry of `links` into `folder_name`.

        Args:
            folder_name: Sub-folder of the archive folder to download into.
            links: Mapping of URL to File; entries flagged `deleted` are skipped.
            workers: Number of download threads.
        """
        with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
            pending = [
                executor.submit(self._download_file, folder_name, url, details.filename)
                for url, details in links.items()
                if not details.deleted
            ]
            # Do this to check if any downloads failed: result() re-raises
            # whatever exception the worker hit.
            for future in concurrent.futures.as_completed(pending):
                future.result()

    def _download_file(self, folder_name: str, url: str, filename: str) -> None:
        """
        Download file from Webex Teams into `folder_name` inside the archive.

        A non-success HTTP answer is logged and skipped so that an error body
        is never stored under the file's name.
        """
        headers = {
            "Authorization": f"Bearer {self.access_token}"
        }
        r = requests.get(url, headers=headers, stream=True,
                         timeout=self.single_request_timeout)
        # Closed explicitly rather than via `with requests.get(...)`, which is
        # not supported in all requests versions.
        try:
            if not r.ok:
                logger.warning("Skipping download of %s: HTTP status %s",
                               url, r.status_code)
                return
            with open(self._archive_path(folder_name, f"{filename}"), "wb") as f:
                for chunk in r.iter_content(chunk_size=self._DOWNLOAD_CHUNK_SIZE):
                    f.write(chunk)
        finally:
            r.close()

    def _compress_folder(self, file_format: str) -> str:
        """Compress `archive_folder_name` folder with the format defined by file_format param"""
        return shutil.make_archive(self.archive_folder_name, file_format, self.archive_folder_name)
|