# archiver.py
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. import praw
  4. import snudown
  5. import datetime
  6. import time
  7. import re
  8. import sys
  9. from requests.exceptions import HTTPError
  10. """
  11. Customization Configuration
  12. """
  13. # Default postID: #
  14. postID='15zmjl'
  15. # Path to which to output the file #
  16. outputFilePath='./'
  17. # The Path to the stylesheet, relative to where the html file will be stored #
  18. pathToCSS='css/style.css'
  19. """
  20. Reddit Post Archiver
  21. By Samuel Johnson Stoever
  22. """
  23. if len(sys.argv) == 1:
  24. print('No post ID was provided. Using default postID.')
  25. elif len(sys.argv) > 2:
  26. print('Too Many Arguments. Using default postID.')
  27. else:
  28. postID = sys.argv[1]
  29. outputFilePath = outputFilePath + postID + '.html'
  30. monthsList = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
  31. def writeHeader(posttitle):
  32. htmlFile.write('<!DOCTYPE html>\n<html>\n<head>\n')
  33. htmlFile.write('\t<meta charset="utf-8"/>\n')
  34. htmlFile.write('\t<link type="text/css" rel="stylesheet" href="' + pathToCSS +'"/>\n')
  35. htmlFile.write('\t<title>' + posttitle + '</title>\n')
  36. htmlFile.write('</head>\n<body>\n')
  37. def parsePost(postObject):
  38. writeHeader(fixUnicode(postObject.title))
  39. postObject.replace_more_comments()
  40. postAuthorName = ''
  41. postAuthorExists = 0
  42. try:
  43. postAuthorName = fixUnicode(postObject.author.name)
  44. postAuthorExists = 1
  45. except AttributeError:
  46. postAuthorExists = 0
  47. htmlFile.write('<div class="title">\n')
  48. if postObject.is_self:
  49. # The post is a self post
  50. htmlFile.write(fixUnicode(postObject.title))
  51. htmlFile.write('\n<br/><strong>')
  52. else:
  53. # The post is a link post
  54. htmlFile.write('<a id="postlink" href="' + fixUnicode(postObject.url))
  55. htmlFile.write('">')
  56. htmlFile.write(fixUnicode(postObject.title))
  57. htmlFile.write('</a>\n<br/><strong>')
  58. if postAuthorExists:
  59. htmlFile.write('Posted by <a id="userlink" href="' + fixUnicode(postObject.author._url))
  60. htmlFile.write('">')
  61. htmlFile.write(postAuthorName)
  62. htmlFile.write('</a>. </strong><em>')
  63. else:
  64. htmlFile.write('Posted by [Deleted]. </strong><em>')
  65. htmlFile.write('Posted at ')
  66. postDate = time.gmtime(postObject.created_utc)
  67. htmlFile.write(str(postDate.tm_hour) + ':')
  68. htmlFile.write(str(postDate.tm_min) + ' UTC on ')
  69. htmlFile.write(monthsList[postDate.tm_mon-1] + ' ')
  70. htmlFile.write(str(postDate.tm_mday) + ', ' + str(postDate.tm_year))
  71. htmlFile.write('. ' + str(postObject.ups - postObject.downs))
  72. if postObject.is_self:
  73. htmlFile.write(' Points. </em><em>(self.<a id="selfLink" href="')
  74. else:
  75. htmlFile.write(' Points. </em><em>(<a id="selfLink" href="')
  76. htmlFile.write(postObject.subreddit._url)
  77. htmlFile.write('">' + postObject.subreddit.display_name)
  78. if postObject.is_self:
  79. htmlFile.write('</a>)</em><em>')
  80. else:
  81. htmlFile.write('</a> Subreddit)</em><em>')
  82. htmlFile.write(' (<a id="postpermalink" href="')
  83. htmlFile.write(fixUnicode(postObject.permalink))
  84. htmlFile.write('">Permalink</a>)</em>\n')
  85. if postObject.is_self:
  86. htmlFile.write('<div class="post">\n')
  87. htmlFile.write(snudown.markdown(fixMarkdown(postObject.selftext)))
  88. htmlFile.write('</div>\n')
  89. else:
  90. htmlFile.write('<div class="post">\n<p>\n')
  91. htmlFile.write(postObject.url)
  92. htmlFile.write('</p>\n</div>\n')
  93. htmlFile.write('</div>\n')
  94. for comment in postObject._comments:
  95. parseComment(comment, postAuthorName, postAuthorExists)
  96. htmlFile.write('<hr id="footerhr">\n')
  97. htmlFile.write('<div id="footer"><em>Archived on ')
  98. htmlFile.write(str(datetime.datetime.utcnow()))
  99. htmlFile.write(' UTC</em></div>')
  100. htmlFile.write('\n\n</body>\n</html>\n')
  101. #Done
  102. def parseComment(redditComment, postAuthorName, postAuthorExists, isRoot=True):
  103. commentAuthorName = ''
  104. commentAuthorExists = 0
  105. try:
  106. commentAuthorName = fixUnicode(redditComment.author.name)
  107. commentAuthorExists = 1
  108. except AttributeError:
  109. commentAuthorExists = 0
  110. if isRoot:
  111. htmlFile.write('<div id="' + str(redditComment.id))
  112. htmlFile.write('" class="comment">\n')
  113. else:
  114. htmlFile.write('<div id="' + str(redditComment.id))
  115. htmlFile.write('" class="comment" style="margin-bottom:10px;margin-left:0px;">\n')
  116. htmlFile.write('<div class="commentinfo">\n')
  117. if commentAuthorExists:
  118. if postAuthorExists and postAuthorName == commentAuthorName:
  119. htmlFile.write('<a href="' + redditComment.author._url)
  120. htmlFile.write('" class="postOP-comment">' + commentAuthorName + '</a> <em>')
  121. else:
  122. htmlFile.write('<a href="' + redditComment.author._url)
  123. htmlFile.write('">' + commentAuthorName + '</a> <em>')
  124. else:
  125. htmlFile.write('<strong>[Deleted]</strong> <em>')
  126. htmlFile.write(str(redditComment.ups - redditComment.downs))
  127. htmlFile.write(' Points </em><em>')
  128. htmlFile.write('Posted at ')
  129. postDate = time.gmtime(redditComment.created_utc)
  130. htmlFile.write(str(postDate.tm_hour) + ':')
  131. htmlFile.write(str(postDate.tm_min) + ' UTC on ')
  132. htmlFile.write(monthsList[postDate.tm_mon-1] + ' ')
  133. htmlFile.write(str(postDate.tm_mday) + ', ' + str(postDate.tm_year))
  134. htmlFile.write('</em></div>\n')
  135. htmlFile.write(snudown.markdown(fixMarkdown(redditComment.body)))
  136. for reply in redditComment._replies:
  137. parseComment(reply, postAuthorName, postAuthorExists, False)
  138. htmlFile.write('</div>\n')
  139. #Done
  140. def fixMarkdown(markdown):
  141. newMarkdown = markdown.encode('utf8')
  142. return re.sub('\&gt;', '>', str(newMarkdown))
  143. def fixUnicode(text):
  144. return str(text.encode('utf8'))
  145. # End Function Definitions
  146. r = praw.Reddit(user_agent='RedditPostArchiver Bot, version 0.93')
  147. # Disclaimer, storing plain text passwords is bad.
  148. # uncomment the following line to login (e.g., in case of Unable to Archive Post:
  149. # r.login('username', 'password')
  150. try:
  151. thePost = r.get_submission(submission_id=postID)
  152. htmlFile = open(outputFilePath,'w')
  153. parsePost(thePost)
  154. htmlFile.close()
  155. except HTTPError:
  156. print('Unable to Archive Post: Invalid PostID or Log In Required (see line 157 of script)')
  157. ##Done