archive_group.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. '''
  2. Yahoo-Groups-Archiver Copyright 2015, 2017, 2018 Andrew Ferguson and others
  3. YahooGroups-Archiver, a simple python script that allows for all
  4. messages in a public Yahoo Group to be archived.
  5. This program is free software: you can redistribute it and/or modify
  6. it under the terms of the GNU General Public License as published by
  7. the Free Software Foundation, either version 3 of the License, or
  8. (at your option) any later version.
  9. This program is distributed in the hope that it will be useful
  10. but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  12. GNU General Public License for more details.
  13. You should have received a copy of the GNU General Public License
  14. along with this program. If not, see <http://www.gnu.org/licenses/>.
  15. '''
# Values of the Yahoo "T" and "Y" cookies. They are sent with every API
# request made by group_messages_max() and archive_message(). Per the error
# message printed by group_messages_max(), they must be filled in (copied from
# a logged-in browser session on yahoo.com) to archive a private group.
cookie_T = 'COOKIE_T_DATA_GOES_HERE'
cookie_Y = 'COOKIE_Y_DATA_GOES_HERE'
  18. import json #required for reading various JSON attributes from the content
  19. import requests #required for fetching the raw messages
  20. import os #required for checking if a file exists locally
  21. import time #required if Yahoo blocks access temporarily (to wait)
  22. import sys #required to cancel script if blocked by Yahoo
  23. import shutil #required for deleting an old folder
  24. import glob #required to find the most recent message downloaded
  25. import time #required to log the date and time of run
  26. def archive_group(groupName, mode="update"):
  27. log("\nArchiving group '" + groupName + "', mode: " + mode + " , on " + time.strftime("%c"), groupName)
  28. startTime = time.time()
  29. msgsArchived = 0
  30. if mode == "retry":
  31. #don't archive any messages we already have
  32. #but try to archive ones that we don't, and may have
  33. #already attempted to archive
  34. min = 1
  35. elif mode == "update":
  36. #start archiving at the last+1 message message we archived
  37. mostRecent = 1
  38. if os.path.exists(groupName):
  39. oldDir = os.getcwd()
  40. os.chdir(groupName)
  41. for file in glob.glob("*.json"):
  42. if int(file[0:-5]) > mostRecent:
  43. mostRecent = int(file[0:-5])
  44. os.chdir(oldDir)
  45. min = mostRecent
  46. elif mode == "restart":
  47. #delete all previous archival attempts and archive everything again
  48. if os.path.exists(groupName):
  49. shutil.rmtree(groupName)
  50. min = 1
  51. else:
  52. print ("You have specified an invalid mode (" + mode + ").")
  53. print ("Valid modes are:\nupdate - add any new messages to the archive\nretry - attempt to get all messages that are not in the archive\nrestart - delete archive and start from scratch")
  54. sys.exit()
  55. if not os.path.exists(groupName):
  56. os.makedirs(groupName)
  57. max = group_messages_max(groupName)
  58. for x in range(min,max+1):
  59. if not os.path.isfile(groupName + '/' + str(x) + ".json"):
  60. print ("Archiving message " + str(x) + " of " + str(max))
  61. sucsess = archive_message(groupName, x)
  62. if sucsess == True:
  63. msgsArchived = msgsArchived + 1
  64. log("Archive finished, archived " + str(msgsArchived) + ", time taken is " + str(time.time() - startTime) + " seconds", groupName)
  65. def group_messages_max(groupName):
  66. s = requests.Session()
  67. resp = s.get('https://groups.yahoo.com/api/v1/groups/' + groupName + '/messages?count=1&sortOrder=desc&direction=-1', cookies={'T': cookie_T, 'Y': cookie_Y})
  68. try:
  69. pageHTML = resp.text
  70. pageJson = json.loads(pageHTML)
  71. except ValueError as valueError:
  72. if "Stay signed in" in pageHTML and "Trouble signing in" in pageHTML:
  73. #the user needs to be signed in to Yahoo
  74. print ("Error. The group you are trying to archive is a private group. To archive a private group using this tool, login to a Yahoo account that has access to the private groups, then extract the data from the cookies Y and T from the domain yahoo.com . Paste this data into the appropriate variables (cookie_Y and cookie_T) at the top of this script, and run the script again.")
  75. sys.exit()
  76. else:
  77. raise valueError
  78. return pageJson["ygData"]["totalRecords"]
  79. def archive_message(groupName, msgNumber, depth=0):
  80. global failed
  81. failed = False
  82. s = requests.Session()
  83. resp = s.get('https://groups.yahoo.com/api/v1/groups/' + groupName + '/messages/' + str(msgNumber) + '/raw', cookies={'T': cookie_T, 'Y': cookie_Y})
  84. if resp.status_code != 200:
  85. #some other problem, perhaps being refused access by Yahoo?
  86. #retry for a max of 3 times anyway
  87. if depth < 3:
  88. print ("Cannot get message " + str(msgNumber) + ", attempt " + str(depth+1) + " of 3 due to HTTP status code " + str(resp.status_code))
  89. time.sleep(0.1)
  90. archive_message(groupName,msgNumber,depth+1)
  91. else:
  92. if resp.status_code == 500:
  93. #we are most likely being blocked by Yahoo
  94. log("Archive halted - it appears Yahoo has blocked you.", groupName)
  95. log("Check if you can access the group's homepage from your browser. If you can't, you have been blocked.", groupName)
  96. log("Don't worry, in a few hours (normally less than 3) you'll be unblocked and you can run this script again - it'll continue where you left off." ,groupName)
  97. sys.exit()
  98. log("Failed to retrive message " + str(msgNumber) + " due to HTTP status code " + str(resp.status_code), groupName )
  99. failed = True
  100. if failed == True:
  101. return False
  102. msgJson = resp.text
  103. writeFile = open((groupName + "/" + str(msgNumber) + ".json"), "wb")
  104. writeFile.write(msgJson.encode('utf-8'))
  105. writeFile.close()
  106. return True
  107. global writeLogFile
  108. def log(msg, groupName):
  109. print (msg)
  110. if writeLogFile:
  111. logF = open(groupName + ".txt", "a")
  112. logF.write("\n" + msg)
  113. logF.close()
  114. if __name__ == "__main__":
  115. global writeLogFile
  116. writeLogFile = True
  117. os.chdir(os.path.dirname(os.path.abspath(__file__)))
  118. if "nologs" in sys.argv:
  119. print ("Logging mode OFF")
  120. writeLogFile = False
  121. sys.argv.remove("nologs")
  122. if len(sys.argv) > 2:
  123. archive_group(sys.argv[1], sys.argv[2])
  124. else:
  125. archive_group(sys.argv[1])