123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165 |
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- import praw
- import snudown
- import datetime
- import time
- import re
- import sys
- from requests.exceptions import HTTPError
# ---------------------------------------------------------------------------
# Customization Configuration
# ---------------------------------------------------------------------------
# ID of the post that is archived when none is given on the command line.
postID = '15zmjl'
# Directory prefix under which the generated HTML file is written.
outputFilePath = './'
# Stylesheet location, relative to where the HTML file will be stored.
pathToCSS = 'css/style.css'

# ---------------------------------------------------------------------------
# Reddit Post Archiver
# By Samuel Johnson Stoever
# ---------------------------------------------------------------------------
# Exactly one command-line argument overrides the default post ID; any other
# argument count keeps the default and says so.
if len(sys.argv) > 2:
    print('Too Many Arguments. Using default postID.')
elif len(sys.argv) == 1:
    print('No post ID was provided. Using default postID.')
else:
    postID = sys.argv[1]

# The archive is named after the post: <outputFilePath><postID>.html
outputFilePath += postID + '.html'

# Month names indexed by (struct_time.tm_mon - 1).
monthsList = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
def writeHeader(posttitle):
    """Write the opening HTML boilerplate of the archive page.

    Emits, to the module-level ``htmlFile``, the doctype, the <head> section
    (UTF-8 charset declaration, a stylesheet link built from the module-level
    ``pathToCSS``, and the page title) followed by the opening <body> tag.

    posttitle -- string placed verbatim inside the <title> element.
    """
    emit = htmlFile.write
    emit('<!DOCTYPE html>\n<html>\n<head>\n')
    emit('\t<meta charset="utf-8"/>\n')
    emit(''.join(('\t<link type="text/css" rel="stylesheet" href="', pathToCSS, '"/>\n')))
    emit(''.join(('\t<title>', posttitle, '</title>\n')))
    emit('</head>\n<body>\n')
def parsePost(postObject):
    """Write the complete archive page for one submission to ``htmlFile``.

    The page consists of the HTML header, a title block (title, author,
    timestamp, score, subreddit link, permalink and the post body), every
    comment tree rendered through ``parseComment``, and a footer carrying the
    archive time.

    postObject -- the submission to archive. The attributes read here are
                  ``title``, ``author``, ``is_self``, ``url``, ``created_utc``,
                  ``ups``, ``downs``, ``subreddit``, ``permalink``,
                  ``selftext`` and ``_comments``.
    """
    write = htmlFile.write
    writeHeader(fixUnicode(postObject.title))
    # NOTE(review): presumably swaps praw's "load more comments" placeholders
    # for the real comments so the tree walked below is complete -- confirm
    # against the praw documentation.
    postObject.replace_more_comments()

    # A post without a usable author raises AttributeError on the lookup and
    # is rendered as "[Deleted]" below.
    try:
        postAuthorName = fixUnicode(postObject.author.name)
        postAuthorExists = True
    except AttributeError:
        postAuthorName = ''
        postAuthorExists = False

    isSelfPost = postObject.is_self

    # --- Title line: plain text for self posts, a link for link posts ---
    write('<div class="title">\n')
    if isSelfPost:
        write(fixUnicode(postObject.title))
        write('\n<br/><strong>')
    else:
        write('<a id="postlink" href="' + fixUnicode(postObject.url))
        write('">')
        write(fixUnicode(postObject.title))
        write('</a>\n<br/><strong>')

    # --- Byline ---
    if postAuthorExists:
        write('Posted by <a id="userlink" href="' + fixUnicode(postObject.author._url))
        write('">')
        write(postAuthorName)
        write('</a>. </strong><em>')
    else:
        write('Posted by [Deleted]. </strong><em>')

    # --- Timestamp and score ---
    postDate = time.gmtime(postObject.created_utc)
    # Minutes are zero-padded so that e.g. 12:05 is not rendered as "12:5".
    write('Posted at %d:%02d UTC on %s %d, %d' % (
        postDate.tm_hour,
        postDate.tm_min,
        monthsList[postDate.tm_mon - 1],
        postDate.tm_mday,
        postDate.tm_year,
    ))
    write('. ' + str(postObject.ups - postObject.downs))

    # --- Subreddit link and permalink ---
    if isSelfPost:
        write(' Points. </em><em>(self.<a id="selfLink" href="')
    else:
        write(' Points. </em><em>(<a id="selfLink" href="')
    write(postObject.subreddit._url)
    write('">' + postObject.subreddit.display_name)
    if isSelfPost:
        write('</a>)</em><em>')
    else:
        write('</a> Subreddit)</em><em>')
    write(' (<a id="postpermalink" href="')
    write(fixUnicode(postObject.permalink))
    write('">Permalink</a>)</em>\n')

    # --- Post body: rendered Markdown for self posts, the URL otherwise ---
    if isSelfPost:
        write('<div class="post">\n')
        write(snudown.markdown(fixMarkdown(postObject.selftext)))
        write('</div>\n')
    else:
        write('<div class="post">\n<p>\n')
        # Encoded the same way as the title link above; writing the raw URL
        # fails on Python 2 when it contains non-ASCII characters.
        write(fixUnicode(postObject.url))
        write('</p>\n</div>\n')
    write('</div>\n')

    # --- Comment trees ---
    for comment in postObject._comments:
        parseComment(comment, postAuthorName, postAuthorExists)

    # --- Footer ---
    write('<hr id="footerhr">\n')
    write('<div id="footer"><em>Archived on ')
    write(str(datetime.datetime.utcnow()))
    write(' UTC</em></div>')
    write('\n\n</body>\n</html>\n')
def parseComment(redditComment, postAuthorName, postAuthorExists, isRoot=True):
    """Write one comment and, recursively, all of its replies to ``htmlFile``.

    redditComment    -- the comment to render. The attributes read here are
                        ``author``, ``id``, ``ups``, ``downs``,
                        ``created_utc``, ``body`` and ``_replies``.
    postAuthorName   -- encoded name of the submission's author, used to mark
                        the original poster's own comments.
    postAuthorExists -- truthy when the submission has an author.
    isRoot           -- True for top-level comments; replies are called with
                        False and receive an inline style on their <div>.
    """
    write = htmlFile.write

    # A comment without a usable author raises AttributeError on the lookup
    # and is rendered as "[Deleted]" below.
    try:
        commentAuthorName = fixUnicode(redditComment.author.name)
        commentAuthorExists = True
    except AttributeError:
        commentAuthorName = ''
        commentAuthorExists = False

    # Opening tag: identical for both nesting levels except for the inline
    # style added to replies.
    write('<div id="' + str(redditComment.id))
    if isRoot:
        write('" class="comment">\n')
    else:
        write('" class="comment" style="margin-bottom:10px;margin-left:0px;">\n')

    # --- Comment info line: author, score, timestamp ---
    write('<div class="commentinfo">\n')
    if commentAuthorExists:
        write('<a href="' + redditComment.author._url)
        if postAuthorExists and postAuthorName == commentAuthorName:
            # Comment written by the original poster gets its own CSS class.
            write('" class="postOP-comment">' + commentAuthorName + '</a> <em>')
        else:
            write('">' + commentAuthorName + '</a> <em>')
    else:
        write('<strong>[Deleted]</strong> <em>')
    write(str(redditComment.ups - redditComment.downs))
    write(' Points </em><em>')

    postDate = time.gmtime(redditComment.created_utc)
    # Minutes are zero-padded so that e.g. 12:05 is not rendered as "12:5".
    write('Posted at %d:%02d UTC on %s %d, %d' % (
        postDate.tm_hour,
        postDate.tm_min,
        monthsList[postDate.tm_mon - 1],
        postDate.tm_mday,
        postDate.tm_year,
    ))
    write('</em></div>\n')

    # --- Body, then the replies nested inside this comment's <div> ---
    write(snudown.markdown(fixMarkdown(redditComment.body)))
    for reply in redditComment._replies:
        parseComment(reply, postAuthorName, postAuthorExists, False)
    write('</div>\n')
def fixMarkdown(markdown):
    """Prepare Markdown source for rendering with ``snudown.markdown``.

    Turns the HTML entity ``&gt;`` back into a literal ``>``.
    NOTE(review): this assumes the text arrives with ``>`` escaped as
    ``&gt;`` (which would stop Markdown blockquotes from being recognised)
    -- confirm against the Reddit API documentation. The previous
    substitution replaced ``>`` with ``>`` and therefore changed nothing.

    markdown -- the Markdown text (unicode on Python 2, str on Python 3).

    Returns UTF-8 encoded bytes on Python 2 and the text itself on Python 3,
    in both cases with ``&gt;`` unescaped.
    """
    if str is bytes:
        # Python 2: the output file is written as byte strings.
        markdown = markdown.encode('utf8')
    # On Python 3 the text is left as str; str(bytes) there would produce the
    # "b'...'" repr rather than the text.
    return markdown.replace('&gt;', '>')
def fixUnicode(text):
    """Return *text* as the native string type written to ``htmlFile``.

    text -- a text string (unicode on Python 2, str on Python 3).

    On Python 2 the text is encoded to a UTF-8 byte string, exactly as
    before. On Python 3 it is returned unchanged: wrapping the encoded bytes
    in ``str()`` there would yield the "b'...'" repr instead of the text.
    """
    if str is bytes:
        # Python 2: native strings are byte strings.
        return text.encode('utf8')
    return text
# End Function Definitions

# ---------------------------------------------------------------------------
# Script body: fetch the submission and write its archive page.
# ---------------------------------------------------------------------------
r = praw.Reddit(user_agent='RedditPostArchiver Bot, version 0.93')
# Disclaimer: storing plain-text passwords is bad.
# Uncomment the following line to log in (e.g., when the script reports
# "Unable to Archive Post"):
# r.login('username', 'password')
try:
    thePost = r.get_submission(submission_id=postID)
    # ``htmlFile`` is the module-level handle that writeHeader, parsePost and
    # parseComment write to. The ``with`` block closes it even when rendering
    # raises part-way through.
    with open(outputFilePath, 'w') as htmlFile:
        parsePost(thePost)
except HTTPError:
    # The message names the login line itself; a hard-coded line number goes
    # stale as soon as the script is edited.
    print('Unable to Archive Post: Invalid PostID or Log In Required (see the r.login line in the script)')
|