#!/usr/bin/env python3
"""
A simple command-line tool for archiving 4chan-style imageboards.
Copyright (C) 2014 Ryan Chartier

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import argparse
import json
import math
import os
import re
import shutil
import time

import requests


class Response:
    def __init__(self):
        self.error = 0

    @staticmethod
    def current_time():
        return math.ceil(time.time())

    def get_response(self, *args, **kwargs):
        """Wrapper function for requests.get that limits rate."""
        http = requests.get(*args, **kwargs)
        if http.status_code == 200:
            # All is well.
            return http
        elif http.status_code == 522:
            # We are being rate limited.
            print("We are being rate limited. Waiting for 30 seconds.")
            time.sleep(30)
            return self.get_response(*args, **kwargs)
        elif http.status_code == 404:
            # Thread has been deleted.
            print("Thread not found.")
            return False
        elif self.error < 50:
            # Unexpected status code: back off briefly and retry, giving up
            # after 50 accumulated errors.
            time.sleep(10)
            print("Error detected '{}'.".format(http.status_code))
            self.error += 1
            return self.get_response(*args, **kwargs)
        else:
            raise Exception("Too many errors.")


GET = Response()
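
# A quick usage sketch (the URL below is 4chan's /trv/ thread catalog, used
# purely as an illustration):
#
#   resp = GET.get_response("http://a.4cdn.org/trv/threads.json")
#
# This returns the requests.Response on a 200, False on a 404, and sleeps
# and retries on a 522 or any other status code, raising after 50 errors.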


def capture_image(post, args):
    """Downloads the image associated with a post."""
    if "tim" in post:
        filename = str(post['tim']) + post['ext']
        path = args.output + "image/" + filename
        if not os.path.isfile(path):
            url = args.url['images'].format(args.board, filename)
            img = GET.get_response(url, stream=True)
            if img:
                # Stream the raw bytes straight to disk.
                img.raw.decode_content = True
                with open(path, "wb") as im:
                    shutil.copyfileobj(img.raw, im)


def posts(args, since):
    """Iterates over posts newer than `since`."""
    # Get the list of threads from the board catalog. (The original signature
    # took only the board name and relied on the global `args`; it now takes
    # args explicitly.)
    url = args.url['catalog'].format(args.board)
    catalog = GET.get_response(url).json()
    # Iterate over the posts in each recently modified thread.
    for page in catalog:
        for thread in page["threads"]:
            if thread['last_modified'] > since:
                iden = args.url['threads'].format(args.board, thread["no"])
                t = GET.get_response(iden)
                if t:
                    for post in t.json()["posts"]:
                        if post['time'] > since:
                            yield post


def get_since(args):
    """Infer the time of the last scrape from existing archive files."""
    other_archive_files = []
    for filename in os.listdir(args.output):
        if re.match(r"^{}-\d+\.json(\.gz)?$".format(re.escape(args.board)), filename):
            other_archive_files.append(filename)
    other_archive_files.sort()
    since_id = None
    # Walk backwards from the newest archive to the first non-empty one.
    while other_archive_files:
        f = other_archive_files.pop()
        if os.path.getsize(args.output + f) > 0:
            since_id = f
            break
    if not since_id:
        return 0
    # Extract the timestamp from a name like "<board>-YYYYmmddHHMMSS.json".
    since = re.match(r"^{}-(\d+)".format(re.escape(args.board)), since_id).group(1)
    t = time.strptime(since, "%Y%m%d%H%M%S")
    return int(time.mktime(t))
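
# For example, an archive written for board 'trv' at 2014-10-05 12:00:00
# local time is named "trv-20141005120000.json", and get_since() maps that
# name back to the scrape's Unix timestamp ('trv' is the board the help
# text uses as its example).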


def parse(args):
    # Read in the time of the previous scrape.
    since = get_since(args)
    # Create a new archive file and write each new post as one JSON line.
    t = time.strftime("%Y%m%d%H%M%S", time.localtime())
    with open(args.output + "{}-{}.json".format(args.board, t), "w") as fp:
        for post in posts(args, since):
            json.dump(post, fp)
            fp.write("\n")
            if args.image:
                capture_image(post, args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrapes imageboards based on the 4chan API.")
    parser.add_argument("board", nargs=1, help="Specific image board to scrape (ex. 'trv' for the 4chan travel board).")
    parser.add_argument("-output", default="data", help="Optional folder to output results. Defaults to 'data'.")
    parser.add_argument("-image", action="store_true", help="Set to download images.")
    parser.add_argument("-url", choices=("4chan", "8chan"), default="4chan", help="Choose which website to download from.")
    args = parser.parse_args()
    args.board = args.board[0]
    if not args.output.endswith("/"):
        args.output += "/"
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    if args.image and not os.path.exists(args.output + "image/"):
        os.makedirs(args.output + "image/")
    if args.url == "4chan":
        args.url = {
            "catalog": "http://a.4cdn.org/{}/threads.json",
            "threads": "http://a.4cdn.org/{}/thread/{}.json",
            "images": "http://i.4cdn.org/{}/{}",
        }
    elif args.url == "8chan":
        args.url = {
            "catalog": "http://8ch.net/{}/threads.json",
            "threads": "http://8ch.net/{}/res/{}.json",
            "images": "http://8ch.net/{}/src/{}",
        }
    parse(args)
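
# Example invocation (a sketch; 'trv' is the travel board from the help text
# and 'data' is the default output folder):
#
#   python3 imageboard-scraper.py trv -output data -image
#
# Posts are appended one JSON object per line to data/trv-<timestamp>.json;
# with -image, attachments are also saved under data/image/.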