make_Yearly_Text_Archive.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. #!/usr/local/bin/python
  2. '''
  3. Yahoo-Groups-Archiver, Text Archive Script Copyright 2019 Robert Lancaster and others
  4. YahooGroups-Archiver, a simple python script that allows for all
  5. messages in a public Yahoo Group to be archived.
  6. The Text Archive Script allows you to take the downloaded json documents
  7. and turn them into yearly archives of emails sorted into text documents.
  8. Note that the archive-group.py script must be run first.
  9. This program is free software: you can redistribute it and/or modify
  10. it under the terms of the GNU General Public License as published by
  11. the Free Software Foundation, either version 3 of the License, or
  12. (at your option) any later version.
  13. This program is distributed in the hope that it will be useful
  14. but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16. GNU General Public License for more details.
  17. You should have received a copy of the GNU General Public License
  18. along with this program. If not, see <http://www.gnu.org/licenses/>.
  19. '''
  20. import email
  21. import HTMLParser
  22. import json
  23. import os
  24. import sys
  25. from datetime import datetime
  26. from natsort import natsorted, ns
# To avoid Unicode issues: make UTF-8 the process-wide default string encoding,
# so the implicit conversions done by the .decode()/.encode() chains in
# loadYahooMessage are not performed with a narrower default codec.
# NOTE(review): reload() as a builtin and sys.setdefaultencoding look
# Python 2 specific - confirm the target interpreter before porting.
reload(sys)
sys.setdefaultencoding('utf-8')
  30. def archiveYahooMessage(file, archiveFile, messageYear, format):
  31. try:
  32. f = open(archiveFile, 'a')
  33. f.write(loadYahooMessage(file, format))
  34. f.close()
  35. print ('Yahoo Message: ' + file + ' archived to: archive-' + str(messageYear) + '.txt')
  36. except Exception as e:
  37. print ('Yahoo Message: ' + file + ' had an error:')
  38. print (e)
  39. def loadYahooMessage(file, format):
  40. f1 = open(file,'r')
  41. fileContents=f1.read()
  42. f1.close()
  43. jsonDoc = json.loads(fileContents)
  44. emailMessageID = jsonDoc['ygData']['msgId']
  45. emailMessageSender = HTMLParser.HTMLParser().unescape(jsonDoc['ygData']['from']).decode(format).encode('utf-8')
  46. emailMessageTimeStamp = jsonDoc['ygData']['postDate']
  47. emailMessageDateTime = datetime.fromtimestamp(float(emailMessageTimeStamp)).strftime('%Y-%m-%d %H:%M:%S')
  48. emailMessageSubject = HTMLParser.HTMLParser().unescape(jsonDoc['ygData']['subject']).decode(format).encode('utf-8')
  49. emailMessageString = HTMLParser.HTMLParser().unescape(jsonDoc['ygData']['rawEmail']).decode(format).encode('utf-8')
  50. message = email.message_from_string(emailMessageString)
  51. messageBody = getEmailBody(message)
  52. messageText = '-----------------------------------------------------------------------------------\n'
  53. messageText += 'Post ID:' + str(emailMessageID) + '\n'
  54. messageText += 'Sender:' + emailMessageSender + '\n'
  55. messageText += 'Post Date/Time:' + emailMessageDateTime + '\n'
  56. messageText += 'Subject:' + emailMessageSubject + '\n'
  57. messageText += 'Message:' + '\n\n'
  58. messageText += messageBody
  59. messageText += '\n\n\n\n\n'
  60. return messageText
  61. def getYahooMessageYear(file):
  62. f1 = open(file,'r')
  63. fileContents=f1.read()
  64. f1.close()
  65. jsonDoc = json.loads(fileContents)
  66. emailMessageTimeStamp = jsonDoc['ygData']['postDate']
  67. return datetime.fromtimestamp(float(emailMessageTimeStamp)).year
  68. # Thank you to the help in this forum for the bulk of this function
  69. # https://stackoverflow.com/questions/17874360/python-how-to-parse-the-body-from-a-raw-email-given-that-raw-email-does-not
  70. def getEmailBody(message):
  71. body = ''
  72. if message.is_multipart():
  73. for part in message.walk():
  74. ctype = part.get_content_type()
  75. cdispo = str(part.get('Content-Disposition'))
  76. # skip any text/plain (txt) attachments
  77. if ctype == 'text/plain' and 'attachment' not in cdispo:
  78. body += part.get_payload(decode=True) # decode
  79. break
  80. # not multipart - i.e. plain text, no attachments, keeping fingers crossed
  81. else:
  82. body += message.get_payload(decode=True)
  83. return body
  84. ## This is where the script starts
  85. if len(sys.argv) < 2:
  86. sys.exit('You need to specify your group name')
  87. groupName = sys.argv[1]
  88. oldDir = os.getcwd()
  89. if os.path.exists(groupName):
  90. archiveDir = os.path.abspath(groupName + '-archive')
  91. if not os.path.exists(archiveDir):
  92. os.makedirs(archiveDir)
  93. os.chdir(groupName)
  94. for file in natsorted(os.listdir(os.getcwd())):
  95. messageYear = getYahooMessageYear(file)
  96. archiveFile = archiveDir + '/archive-' + str(messageYear) + '.txt'
  97. archiveYahooMessage(file, archiveFile, messageYear, 'utf-8')
  98. else:
  99. sys.exit('Please run archive-group.py first')
  100. os.chdir(oldDir)
  101. print('Complete')