make_Yearly_Text_Archive_html.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130
  1. #!/usr/local/bin/python
  2. '''
  3. Yahoo-Groups-Archiver, HTML Archive Script Copyright 2019 Robert Lancaster and others
  4. YahooGroups-Archiver, a simple python script that allows for all
  5. messages in a public Yahoo Group to be archived.
  6. The HTML Archive Script allows you to take the downloaded json documents
  7. and turn them into html-based yearly archives of emails.
  8. Note that the archive-group.py script must be run first.
  9. This program is free software: you can redistribute it and/or modify
  10. it under the terms of the GNU General Public License as published by
  11. the Free Software Foundation, either version 3 of the License, or
  12. (at your option) any later version.
  13. This program is distributed in the hope that it will be useful
  14. but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. GNU General Public License for more details.
  17. You should have received a copy of the GNU General Public License
  18. along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. '''
  20. import email
  21. import HTMLParser
  22. import json
  23. import os
  24. import sys
  25. from datetime import datetime
  26. from natsort import natsorted, ns
  27. import cgi
# To avoid Unicode issues (Python 2 only): make UTF-8 the interpreter-wide
# default encoding.  sys.setdefaultencoding() is removed from the sys module
# during interpreter start-up, so sys has to be reloaded to get it back.
# NOTE(review): loadYahooMessage() calls .decode(format) on the unescaped
# JSON values; that call presumably depends on this default when the values
# are unicode objects - confirm before removing this hack.
reload(sys)
sys.setdefaultencoding('utf-8')
  31. def archiveYahooMessage(file, archiveFile, messageYear, format):
  32. try:
  33. f = open(archiveFile, 'a')
  34. if f.tell() == 0:
  35. f.write("<style>pre {white-space: pre-wrap;}</style>\n");
  36. f.write(loadYahooMessage(file, format))
  37. f.close()
  38. print ('Yahoo Message: ' + file + ' archived to: archive-' + str(messageYear) + '.html')
  39. except Exception as e:
  40. print ('Yahoo Message: ' + file + ' had an error:')
  41. print (e)
  42. def loadYahooMessage(file, format):
  43. f1 = open(file,'r')
  44. fileContents=f1.read()
  45. f1.close()
  46. jsonDoc = json.loads(fileContents)
  47. emailMessageID = jsonDoc['ygData']['msgId']
  48. emailMessageSender = HTMLParser.HTMLParser().unescape(jsonDoc['ygData']['from']).decode(format).encode('utf-8')
  49. emailMessageTimeStamp = jsonDoc['ygData']['postDate']
  50. emailMessageDateTime = datetime.fromtimestamp(float(emailMessageTimeStamp)).strftime('%Y-%m-%d %H:%M:%S')
  51. emailMessageSubject = HTMLParser.HTMLParser().unescape(jsonDoc['ygData']['subject']).decode(format).encode('utf-8')
  52. emailMessageString = HTMLParser.HTMLParser().unescape(jsonDoc['ygData']['rawEmail']).decode(format).encode('utf-8')
  53. message = email.message_from_string(emailMessageString)
  54. messageBody = getEmailBody(message)
  55. messageText = '-----------------------------------------------------------------------------------<br>' + "\n"
  56. messageText += 'Post ID:' + str(emailMessageID) + '<br>' + "\n"
  57. messageText += 'Sender:' + cgi.escape(emailMessageSender) + '<br>' + "\n"
  58. messageText += 'Post Date/Time:' + cgi.escape(emailMessageDateTime) + '<br>' + "\n"
  59. messageText += 'Subject:' + cgi.escape(emailMessageSubject) + '<br>' + "\n"
  60. messageText += 'Message:' + '<br><br>' + "\n"
  61. messageText += messageBody
  62. messageText += '<br><br><br><br><br>' + "\n"
  63. return messageText
  64. def getYahooMessageYear(file):
  65. f1 = open(file,'r')
  66. fileContents=f1.read()
  67. f1.close()
  68. jsonDoc = json.loads(fileContents)
  69. emailMessageTimeStamp = jsonDoc['ygData']['postDate']
  70. return datetime.fromtimestamp(float(emailMessageTimeStamp)).year
  71. # Thank you to the help in this forum for the bulk of this function
  72. # https://stackoverflow.com/questions/17874360/python-how-to-parse-the-body-from-a-raw-email-given-that-raw-email-does-not
  73. def getEmailBody(message):
  74. body = ''
  75. if message.is_multipart():
  76. for part in message.walk():
  77. ctype = part.get_content_type()
  78. cdispo = str(part.get('Content-Disposition'))
  79. # skip any text/plain (txt) attachments
  80. if ctype == 'text/plain' and 'attachment' not in cdispo:
  81. body += '<pre>'
  82. body += cgi.escape(part.get_payload(decode=True)) # decode
  83. body += '</pre>'
  84. break
  85. # not multipart - i.e. plain text, no attachments, keeping fingers crossed
  86. else:
  87. ctype = message.get_content_type()
  88. if ctype != 'text/html':
  89. body += '<pre>'
  90. body += cgi.escape(message.get_payload(decode=True))
  91. body += '</pre>'
  92. else:
  93. body += message.get_payload(decode=True)
  94. return body
  95. ## This is where the script starts
  96. if len(sys.argv) < 2:
  97. sys.exit('You need to specify your group name')
  98. groupName = sys.argv[1]
  99. oldDir = os.getcwd()
  100. if os.path.exists(groupName):
  101. archiveDir = os.path.abspath(groupName + '-archive')
  102. if not os.path.exists(archiveDir):
  103. os.makedirs(archiveDir)
  104. os.chdir(groupName)
  105. for file in natsorted(os.listdir(os.getcwd())):
  106. messageYear = getYahooMessageYear(file)
  107. archiveFile = archiveDir + '/archive-' + str(messageYear) + '.html'
  108. archiveYahooMessage(file, archiveFile, messageYear, 'utf-8')
  109. else:
  110. sys.exit('Please run archive-group.py first')
  111. os.chdir(oldDir)
  112. print('Complete')