offline_reading.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. import os
  2. import markdown
  3. from . import common
  4. from . import exceptions
  5. from . import tsdb
# Opening boilerplate for every generated page, up to and including <body>.
# It is filled in with str.format(title=...) by html_from_database, which is
# why the literal CSS braces below are doubled.
HTML_HEADER = '''
<html>
<head>
<title>{title}</title>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"/>
<style>
.submission, .comment
{{
padding-left: 20px;
padding-right: 4px;
}}
.comment
{{
margin-top: 4px;
margin-bottom: 4px;
border: 1px solid black;
}}
.submission
{{
border: 2px solid blue;
}}
.hidden
{{
display: none;
}}
</style>
</head>
<body>
'''.strip()
# Closing boilerplate for every generated page. html_from_database appends it
# verbatim (it is never passed through str.format), so the JavaScript braces
# are single. toggle_collapse is the handler invoked by the [-] / [+] link in
# HTML_COMMENT: it shows or hides the comment's "collapsible" div.
HTML_FOOTER = '''
</body>
<script>
function toggle_collapse(comment_div)
{
var button = comment_div.getElementsByClassName("toggle_hide_button")[0];
var collapsible = comment_div.getElementsByClassName("collapsible")[0];
if (collapsible.classList.contains("hidden"))
{
collapsible.classList.remove("hidden");
button.innerText = "[-]";
}
else
{
collapsible.classList.add("hidden");
button.innerText = "[+]";
}
}
</script>
</html>
'''.strip()
# Template for a single comment. It is formatted in two passes:
# 1. html_format_comment fills in the per-comment fields; that pass also
#    collapses the doubled braces, turning {{children}} into {children}.
# 2. html_from_tree then formats the result again to insert the rendered
#    replies in place of {children}.
# Any user-supplied text inserted during pass 1 must therefore have its braces
# doubled (see sanitize_braces) so that pass 2 leaves it intact.
HTML_COMMENT = '''
<div class="comment" id="{id}">
<p class="userinfo">
<a
class="toggle_hide_button"
href="javascript:void(0)"
onclick="toggle_collapse(this.parentElement.parentElement)">[-]
</a>
{usernamelink}
|
<span class="score">{score} points</span>
|
<a class="timestamp" href="{permalink}">{human}</a>
</p>
<div class="collapsible">
{body}
{{children}}
</div>
</div>
'''.strip()
# Template for the submission at the top of a page. Like HTML_COMMENT it is
# formatted twice: html_format_submission fills in the submission fields
# (turning {{children}} into {children}), then html_from_tree inserts the
# rendered top-level comments after the submission's div.
HTML_SUBMISSION = '''
<div class="submission" id="{id}">
<p class="userinfo">
{usernamelink}
|
<span class="score">{score} points</span>
|
<a class="timestamp" href="{permalink}">{human}</a>
</p>
<strong>{title}</strong>
<p>{url_or_text}</p>
</div>
{{children}}
'''.strip()
  91. class TreeNode:
  92. def __init__(self, identifier, data, parent=None):
  93. assert isinstance(identifier, str)
  94. assert '\\' not in identifier
  95. self.identifier = identifier
  96. self.data = data
  97. self.parent = parent
  98. self.children = {}
  99. def __getitem__(self, key):
  100. return self.children[key]
  101. def __repr__(self):
  102. return 'TreeNode %s' % self.abspath()
  103. def abspath(self):
  104. node = self
  105. nodes = [node]
  106. while node.parent is not None:
  107. node = node.parent
  108. nodes.append(node)
  109. nodes.reverse()
  110. nodes = [node.identifier for node in nodes]
  111. return '\\'.join(nodes)
  112. def add_child(self, other_node, overwrite_parent=False):
  113. self.check_child_availability(other_node.identifier)
  114. if other_node.parent is not None and not overwrite_parent:
  115. raise ValueError('That node already has a parent. Try `overwrite_parent=True`')
  116. other_node.parent = self
  117. self.children[other_node.identifier] = other_node
  118. return other_node
  119. def check_child_availability(self, identifier):
  120. if ':' in identifier:
  121. raise Exception('Only roots may have a colon')
  122. if identifier in self.children:
  123. raise Exception('Node %s already has child %s' % (self.identifier, identifier))
  124. def detach(self):
  125. del self.parent.children[self.identifier]
  126. self.parent = None
  127. def listnodes(self, customsort=None):
  128. items = list(self.children.items())
  129. if customsort is None:
  130. items.sort(key=lambda x: x[0].lower())
  131. else:
  132. items.sort(key=customsort)
  133. return [item[1] for item in items]
  134. def merge_other(self, othertree, otherroot=None):
  135. newroot = None
  136. if ':' in othertree.identifier:
  137. if otherroot is None:
  138. raise Exception('Must specify a new name for the other tree\'s root')
  139. else:
  140. newroot = otherroot
  141. else:
  142. newroot = othertree.identifier
  143. othertree.identifier = newroot
  144. othertree.parent = self
  145. self.check_child_availability(newroot)
  146. self.children[newroot] = othertree
  147. def printtree(self, customsort=None):
  148. for node in self.walk(customsort):
  149. print(node.abspath())
  150. def walk(self, customsort=None):
  151. yield self
  152. for child in self.listnodes(customsort=customsort):
  153. #print(child)
  154. #print(child.listnodes())
  155. yield from child.walk(customsort=customsort)
  156. def html_format_comment(comment):
  157. text = HTML_COMMENT.format(
  158. id=comment.idstr,
  159. body=sanitize_braces(render_markdown(comment.body)),
  160. usernamelink=html_helper_userlink(comment),
  161. score=comment.score,
  162. human=common.human(comment.created),
  163. permalink=html_helper_permalink(comment),
  164. )
  165. return text
  166. def html_format_submission(submission):
  167. text = HTML_SUBMISSION.format(
  168. id=submission.idstr,
  169. title=sanitize_braces(submission.title),
  170. usernamelink=html_helper_userlink(submission),
  171. score=submission.score,
  172. human=common.human(submission.created),
  173. permalink=html_helper_permalink(submission),
  174. url_or_text=html_helper_urlortext(submission),
  175. )
  176. return text
  177. def html_from_database(database, specific_submission=None):
  178. '''
  179. Given a timesearch database, produce html pages for each
  180. of the submissions it contains (or one particular submission fullname)
  181. '''
  182. if markdown is None:
  183. raise ImportError('Page cannot be rendered without the markdown module')
  184. submission_trees = trees_from_database(database, specific_submission)
  185. for submission_tree in submission_trees:
  186. page = html_from_tree(submission_tree, sort=lambda x: x.data.score * -1)
  187. database.offline_reading_dir.makedirs(exist_ok=True)
  188. html = ''
  189. header = HTML_HEADER.format(title=submission_tree.data.title)
  190. html += header
  191. html += page
  192. html += HTML_FOOTER
  193. yield (submission_tree.identifier, html)
  194. def html_from_tree(tree, sort=None):
  195. '''
  196. Given a tree *whose root is the submission*, return
  197. HTML-formatted text representing each submission's comment page.
  198. '''
  199. if tree.data.object_type == 'submission':
  200. page = html_format_submission(tree.data)
  201. elif tree.data.object_type == 'comment':
  202. page = html_format_comment(tree.data)
  203. children = tree.listnodes()
  204. if sort is not None:
  205. children.sort(key=sort)
  206. children = [html_from_tree(child, sort) for child in children]
  207. if len(children) == 0:
  208. children = ''
  209. else:
  210. children = '\n\n'.join(children)
  211. try:
  212. page = page.format(children=children)
  213. except IndexError:
  214. print(page)
  215. raise
  216. return page
  217. def html_helper_permalink(item):
  218. '''
  219. Given a submission or a comment, return the URL for its permalink.
  220. '''
  221. link = 'https://old.reddit.com/r/%s/comments/' % item.subreddit
  222. if item.object_type == 'submission':
  223. link += item.idstr[3:]
  224. elif item.object_type == 'comment':
  225. link += '%s/_/%s' % (item.submission[3:], item.idstr[3:])
  226. return link
  227. def html_helper_urlortext(submission):
  228. '''
  229. Given a submission, return either an <a> tag for its url, or its
  230. markdown-rendered selftext.
  231. '''
  232. if submission.url:
  233. text = '<a href="{url}">{url}</a>'.format(url=submission.url)
  234. elif submission.selftext:
  235. text = render_markdown(submission.selftext)
  236. else:
  237. text = ''
  238. text = sanitize_braces(text)
  239. return text
  240. def html_helper_userlink(item):
  241. '''
  242. Given a submission or comment, return an <a> tag for its author, or [deleted].
  243. '''
  244. name = item.author
  245. if name.lower() == '[deleted]':
  246. return '[deleted]'
  247. link = 'https://old.reddit.com/u/{name}'
  248. link = '<a href="%s">{name}</a>' % link
  249. link = link.format(name=name)
  250. return link
  251. def render_markdown(text):
  252. # I was going to use html.escape, but then it turns html entities like
  253. # &nbsp; into &amp;nbsp; which doesn't work.
  254. # So I only want to escape the brackets.
  255. escaped = text.replace('<', '&lt;').replace('>', '&rt;')
  256. text = markdown.markdown(escaped, output_format='html5')
  257. return text
  258. def sanitize_braces(text):
  259. text = text.replace('{', '{{')
  260. text = text.replace('}', '}}')
  261. return text
  262. def trees_from_database(database, specific_submission=None):
  263. '''
  264. Given a timesearch database, take all of the submission
  265. ids, take all of the comments for each submission id, and run them
  266. through `tree_from_submission`.
  267. Yield each submission's tree as it is generated.
  268. '''
  269. cur1 = database.sql.cursor()
  270. cur2 = database.sql.cursor()
  271. if specific_submission is None:
  272. cur1.execute('SELECT idstr FROM submissions ORDER BY created ASC')
  273. submission_ids = common.fetchgenerator(cur1)
  274. # sql always returns rows as tuples, even when selecting one column.
  275. submission_ids = (x[0] for x in submission_ids)
  276. else:
  277. specific_submission = common.t3_prefix(specific_submission)
  278. submission_ids = [specific_submission]
  279. found_some_posts = False
  280. for submission_id in submission_ids:
  281. found_some_posts = True
  282. cur2.execute('SELECT * FROM submissions WHERE idstr == ?', [submission_id])
  283. submission = cur2.fetchone()
  284. cur2.execute('SELECT * FROM comments WHERE submission == ?', [submission_id])
  285. fetched_comments = cur2.fetchall()
  286. submission_tree = tree_from_submission(submission, fetched_comments)
  287. yield submission_tree
  288. if not found_some_posts:
  289. raise Exception('Found no submissions!')
  290. def tree_from_submission(submission_dbrow, comments_dbrows):
  291. '''
  292. Given the sqlite data for a submission and all of its comments,
  293. return a tree with the submission id as the root
  294. '''
  295. submission = tsdb.DBEntry(submission_dbrow)
  296. comments = [tsdb.DBEntry(c) for c in comments_dbrows]
  297. comments.sort(key=lambda x: x.created)
  298. print('Building tree for %s (%d comments)' % (submission.idstr, len(comments)))
  299. # Thanks Martin Schmidt for the algorithm
  300. # http://stackoverflow.com/a/29942118/5430534
  301. tree = TreeNode(identifier=submission.idstr, data=submission)
  302. node_map = {}
  303. for comment in comments:
  304. # Ensure this comment is in a node of its own
  305. this_node = node_map.get(comment.idstr, None)
  306. if this_node:
  307. # This ID was detected as a parent of a previous iteration
  308. # Now we're actually filling it in.
  309. this_node.data = comment
  310. else:
  311. this_node = TreeNode(comment.idstr, comment)
  312. node_map[comment.idstr] = this_node
  313. # Attach this node to the parent.
  314. if comment.parent.startswith('t3_'):
  315. tree.add_child(this_node)
  316. else:
  317. parent_node = node_map.get(comment.parent, None)
  318. if not parent_node:
  319. parent_node = TreeNode(comment.parent, data=None)
  320. node_map[comment.parent] = parent_node
  321. parent_node.add_child(this_node)
  322. this_node.parent = parent_node
  323. return tree
  324. def offline_reading(subreddit=None, username=None, specific_submission=None):
  325. if not specific_submission and not common.is_xor(subreddit, username):
  326. raise exceptions.NotExclusive(['subreddit', 'username'])
  327. if specific_submission and not username and not subreddit:
  328. database = tsdb.TSDB.for_submission(specific_submission, do_create=False)
  329. elif subreddit:
  330. database = tsdb.TSDB.for_subreddit(subreddit, do_create=False)
  331. else:
  332. database = tsdb.TSDB.for_user(username, do_create=False)
  333. htmls = html_from_database(database, specific_submission=specific_submission)
  334. for (id, html) in htmls:
  335. html_basename = '%s.html' % id
  336. html_filepath = database.offline_reading_dir.with_child(html_basename)
  337. html_handle = html_filepath.open('w', encoding='utf-8')
  338. html_handle.write(html)
  339. html_handle.close()
  340. print('Wrote', html_filepath.relative_path)
  341. def offline_reading_argparse(args):
  342. return offline_reading(
  343. subreddit=args.subreddit,
  344. username=args.username,
  345. specific_submission=args.specific_submission,
  346. )