formatter.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. import logging
  2. import re
  3. import sys
  4. import emoji
  5. import markdown2
  6. from slackviewer.user import User
# Workaround for ASCII encoding error in Python 2.7
# See https://github.com/hfaran/slack-export-viewer/issues/81
#
# Only takes effect on Python 2; on Python 3 the branch is skipped, so the
# Python-2-only builtin `reload` is never looked up there.
if sys.version_info[0] == 2:
    # Order matters: on Python 2 the site module removes
    # sys.setdefaultencoding at startup, and reloading sys makes it
    # available again so the next line can call it.
    reload(sys)
    # Switch the process-wide default str<->unicode codec from ASCII to UTF-8.
    sys.setdefaultencoding('utf8')
  12. class SlackFormatter(object):
  13. "This formats messages and provides access to workspace-wide data (user and channel metadata)."
  14. # Class-level constants for precompilation of frequently-reused regular expressions
  15. # URL detection relies on http://stackoverflow.com/a/1547940/1798683
  16. _LINK_PAT = re.compile(r"<(https|http|mailto):[A-Za-z0-9_\.\-\/\?\,\=\#\:\@]+\|[^>]+>")
  17. _MENTION_PAT = re.compile(r"<((?:#C|@[UB])\w+)(?:\|([A-Za-z0-9.-_]+))?>")
  18. _HASHTAG_PAT = re.compile(r"(^| )#[A-Za-z][\w\.\-\_]+( |$)")
  19. def __init__(self, USER_DATA, CHANNEL_DATA):
  20. self.__USER_DATA = USER_DATA
  21. self.__CHANNEL_DATA = CHANNEL_DATA
  22. def find_user(self, message):
  23. if message.get("subtype", "").startswith("bot_") and "bot_id" in message and message["bot_id"] not in self.__USER_DATA:
  24. bot_id = message["bot_id"]
  25. logging.debug("bot addition for %s", bot_id)
  26. if "bot_link" in message:
  27. (bot_url, bot_name) = message["bot_link"].strip("<>").split("|", 1)
  28. elif "username" in message:
  29. bot_name = message["username"]
  30. bot_url = None
  31. else:
  32. bot_name = None
  33. bot_url = None
  34. self.__USER_DATA[bot_id] = User({
  35. "user": bot_id,
  36. "real_name": bot_name,
  37. "bot_url": bot_url,
  38. "is_bot": True,
  39. "is_app_user": True
  40. })
  41. user_id = message.get("user") or message.get("bot_id")
  42. if user_id in self.__USER_DATA:
  43. return self.__USER_DATA.get(user_id)
  44. logging.error("unable to find user in %s", message)
  45. def render_text(self, message, process_markdown=True):
  46. message = message.replace("<!channel>", "@channel")
  47. message = message.replace("<!channel|@channel>", "@channel")
  48. message = message.replace("<!here>", "@here")
  49. message = message.replace("<!here|@here>", "@here")
  50. message = message.replace("<!everyone>", "@everyone")
  51. message = message.replace("<!everyone|@everyone>", "@everyone")
  52. # Handle mentions of users, channels and bots (e.g "<@U0BM1CGQY|calvinchanubc> has joined the channel")
  53. message = self._MENTION_PAT.sub(self._sub_annotated_mention, message)
  54. # Handle links
  55. message = self._LINK_PAT.sub(self._sub_hyperlink, message)
  56. # Handle hashtags (that are meant to be hashtags and not headings)
  57. message = self._HASHTAG_PAT.sub(self._sub_hashtag, message)
  58. # Introduce unicode emoji
  59. message = self.slack_to_accepted_emoji(message)
  60. message = emoji.emojize(message, language='alias')
  61. if process_markdown:
  62. # Handle bold (convert * * to ** **)
  63. message = re.sub(r'\*', "**", message)
  64. message = markdown2.markdown(
  65. message,
  66. extras=[
  67. "cuddled-lists",
  68. # This gives us <pre> and <code> tags for ```-fenced blocks
  69. "fenced-code-blocks",
  70. "pyshell"
  71. ]
  72. ).strip()
  73. # Special handling cases for lists
  74. message = message.replace("\n\n<ul>", "<ul>")
  75. message = message.replace("\n<li>", "<li>")
  76. return message
  77. def slack_to_accepted_emoji(self, message):
  78. """Convert some Slack emoji shortcodes to more universal versions"""
  79. # Convert -'s to _'s except for the 1st char (preserve things like :-1:)
  80. # For example, Slack's ":woman-shrugging:" is converted to ":woman_shrugging:"
  81. message = re.sub(
  82. r":([^ <>/:])([^ <>/:]+):",
  83. lambda x: ":{}{}:".format(x.group(1), x.group(2).replace("-", "_")),
  84. message
  85. )
  86. # https://github.com/Ranks/emojione/issues/114
  87. message = message.replace(":simple_smile:", ":slightly_smiling_face:")
  88. return message
  89. def _sub_annotated_mention(self, matchobj):
  90. ref_id = matchobj.group(1)[1:] # drop #/@ from the start, we don't care
  91. annotation = matchobj.group(2)
  92. if ref_id.startswith('C'):
  93. mention_format = "<b>#{}</b>"
  94. if not annotation:
  95. channel = self.__CHANNEL_DATA.get(ref_id)
  96. annotation = channel["name"] if channel else ref_id
  97. else:
  98. mention_format = "@{}"
  99. if not annotation:
  100. user = self.__USER_DATA.get(ref_id)
  101. annotation = user.display_name if user else ref_id
  102. return mention_format.format(annotation)
  103. def _sub_hyperlink(self, matchobj):
  104. compound = matchobj.group(0)[1:-1]
  105. if len(compound.split("|")) == 2:
  106. url, title = compound.split("|")
  107. else:
  108. url, title = compound, compound
  109. result = "<a href=\"{url}\">{title}</a>".format(url=url, title=title)
  110. return result
  111. def _sub_hashtag(self, matchobj):
  112. text = matchobj.group(0)
  113. starting_space = " " if text[0] == " " else ""
  114. ending_space = " " if text[-1] == " " else ""
  115. return "{}<b>{}</b>{}".format(
  116. starting_space,
  117. text.strip(),
  118. ending_space
  119. )