#!/usr/local/bin/python
'''
Yahoo-Groups-Archiver, HTML Archive Script Copyright 2019 Robert Lancaster and others
YahooGroups-Archiver, a simple python script that allows for all
messages in a public Yahoo Group to be archived.
The HTML Archive Script allows you to take the downloaded json documents
and turn them into html-based yearly archives of emails.
Note that the archive-group.py script must be run first.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
' + "\n"
messageText += 'Post ID:' + str(emailMessageID) + '
' + "\n"
messageText += 'Sender:' + cgi.escape(emailMessageSender) + '
' + "\n"
messageText += 'Post Date/Time:' + cgi.escape(emailMessageDateTime) + '
' + "\n"
messageText += 'Subject:' + cgi.escape(emailMessageSubject) + '
' + "\n"
messageText += 'Message:' + '
' + "\n"
messageText += messageBody
messageText += '
' + "\n"
return messageText
def getYahooMessageYear(file):
    '''
    Return the calendar year in which an archived Yahoo message was posted.

    file -- path to a JSON document containing a 'ygData' object whose
            'postDate' entry is a Unix timestamp (a number or a numeric
            string; it is passed through float()).

    Returns the year as an int, taken from datetime.fromtimestamp(), so the
    conversion uses the local timezone of the machine running the script.

    Raises IOError/OSError if the file cannot be opened, ValueError if the
    content is not valid JSON or the timestamp is not numeric, and KeyError
    if 'ygData' or 'postDate' is missing.
    '''
    # The context manager closes the handle even when reading or parsing
    # fails; the previous open()/close() pair leaked it on any exception.
    with open(file, 'r') as messageFile:
        jsonDoc = json.load(messageFile)
    emailMessageTimeStamp = jsonDoc['ygData']['postDate']
    return datetime.fromtimestamp(float(emailMessageTimeStamp)).year
# Thank you to the help in this forum for the bulk of this function
# https://stackoverflow.com/questions/17874360/python-how-to-parse-the-body-from-a-raw-email-given-that-raw-email-does-not
#
# getEmailBody(message): build the body text for one email message and
# return it as a single string.
#   message -- object providing is_multipart(), walk(), get_content_type()
#              and get_payload(); presumably an email.message.Message --
#              confirm against the caller.
#   returns -- the accumulated body string. It stays '' when a multipart
#              message contains no text/plain part that passes the
#              attachment check below.
# NOTE(review): the string literals that wrap the escaped payload appear
# empty or split across two lines in this copy of the file; they may have
# lost their original contents (possibly markup) -- compare with the
# upstream script before relying on them.
# NOTE(review): cgi.escape is not available on Python 3.8+, and
# get_payload(decode=True) returns bytes on Python 3, so this looks written
# for Python 2 -- confirm the target interpreter.
def getEmailBody(message):
body = ''
if message.is_multipart():
# Multipart: look through the parts yielded by message.walk().
for part in message.walk():
ctype = part.get_content_type()
# str() keeps the 'in' test below valid whatever part.get() returns
# (presumably None when the header is absent -- TODO confirm).
cdispo = str(part.get('Content-Disposition'))
# skip any text/plain (txt) attachments: only a text/plain part whose
# Content-Disposition does not mention 'attachment' is used
if ctype == 'text/plain' and 'attachment' not in cdispo:
body += '
'
body += cgi.escape(part.get_payload(decode=True)) # decode
body += ''
# Only the first matching part is used; later parts are ignored.
break
# not multipart - i.e. plain text, no attachments, keeping fingers crossed
else:
ctype = message.get_content_type()
# Anything that is not declared text/html is escaped before being added.
if ctype != 'text/html':
body += ''
body += cgi.escape(message.get_payload(decode=True))
body += ''
else:
# text/html payloads are appended as-is, without escaping.
body += message.get_payload(decode=True)
return body
## This is where the script starts
# Takes the group name as the first command-line argument and reads every
# entry of the directory with that name (each one is parsed as JSON by
# getYahooMessageYear), writing into a sibling '<groupName>-archive' directory.
if len(sys.argv) < 2:
    sys.exit('You need to specify your group name')

groupName = sys.argv[1]
oldDir = os.getcwd()

# Bail out early when the download directory is missing.
if not os.path.exists(groupName):
    sys.exit('Please run archive-group.py first')

# Resolve the archive directory to an absolute path BEFORE changing the
# working directory, so the per-year file paths stay valid afterwards.
archiveDir = os.path.abspath(groupName + '-archive')
if not os.path.exists(archiveDir):
    os.makedirs(archiveDir)

# Work from inside the group directory so the bare names returned by
# os.listdir() can be opened directly.
os.chdir(groupName)
for file in natsorted(os.listdir(os.getcwd())):
    messageYear = getYahooMessageYear(file)
    # One HTML archive file per year of posting.
    archiveFile = '%s/archive-%s.html' % (archiveDir, messageYear)
    archiveYahooMessage(file, archiveFile, messageYear, 'utf-8')

# Put the caller's working directory back before reporting success.
os.chdir(oldDir)
print('Complete')