#!/usr/bin/env python3 """ A simple command line tool for archiving 4chan type imageboards. Copyright (C) 2014 Ryan Chartier This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . """ import json import re import sys import time import os import shutil import argparse import math import requests class Response: def __init__(self): self.error = 0 @staticmethod def current_time(): return math.ceil(time.time()) def get_response(self, *args, **kwargs): """Wrapper function for requests.get that limits rate.""" http = requests.get(*args, **kwargs) if http.status_code == 200: #All is well. return http elif http.status_code == 522: # "We are being rate limited. print("We are being rate limited. Waiting for 30 seconds.") time.sleep(30) return self.get_response(*args, **kwargs) elif http.status_code == 404: #Thread has been deleted. print("Thread not found.") return False elif self.error < 50: time.sleep(10) print("Error detected '{}'.".format(http.status_code)) self.error +=1 return self.get_response(*args, **kwargs) else: print("Too many errors.") raise Exception GET = Response() def capture_image(post, args): """Downloads the image assosiated with a post.""" if "tim" in post: filename = str(post['tim']) + post['ext'] if not os.path.isfile(args.output + "image/" + filename): url = args.url['images'].format(args.board, filename) img = GET.get_response(url, stream=True) if img: img.raw.decode_content = True with open(args.output + "image/" + filename, "wb+") as im: shutil.copyfileobj(img.raw, im) def posts(board, since): """Iterates over new posts.""" #Get list of threads. url = args.url['catalog'].format(board) catalog = GET.get_response(url).json() #Iterate over posts in thread for page in catalog: for thread in page["threads"]: if thread['last_modified'] > since: iden = args.url['threads'].format(board, thread["no"]) t = GET.get_response(iden) if t: for post in t.json()["posts"]: if post['time'] > since: yield post def get_since(args): """ infer last scrape based on other scrapes. """ other_archive_files = [] for filename in os.listdir(args.output): if re.match("^{}-\d+\.json(\.gz)?$".format(args.board), filename): other_archive_files.append(filename) other_archive_files.sort() since_id = None while len(other_archive_files) != 0: f = other_archive_files.pop() if os.path.getsize(args.output + ("/") + f) > 0: since_id = f if not since_id: return 0 since = since_id.rstrip(".gz").rstrip(".json").lstrip(args.board).lstrip("-") t = time.strptime(since, "%Y%m%d%H%M%S") return int(time.mktime(t)) def parse(args): #Read in previous config. since = get_since(args) #Create new file and insert new posts. t = time.strftime("%Y%m%d%H%M%S", time.localtime()) with open(args.output + "/{}-{}.json".format(args.board, t), "w+") as fp: for post in posts(args.board, since): json.dump(post, fp) fp.write("\n") if args.image: capture_image(post, args) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Scrapes imageboards based on the 4chan api.') parser.add_argument("board", nargs=1, help="Specific image board to scrape. (ex. 'trv' for the 4chan travel board.") parser.add_argument("-output", default="data", help="Optional folder to output results. Defaults to 'data'.") parser.add_argument("-image", action="store_true", help="Set to download images.") parser.add_argument("-url", choices=("4chan", "8chan"), default="4chan", help="Choose which website to download from.") args = parser.parse_args() args.board = args.board[0] if not args.output.endswith("/"): args.output += "/" if not os.path.exists(args.output): os.makedirs(args.output) if args.image: if not os.path.exists(args.output + "image/"): os.makedirs(args.output + "image/") if args.url == "4chan": args.url = { "catalog": "http://a.4cdn.org/{}/threads.json", "threads": "http://a.4cdn.org/{}/thread/{}.json", "images": "http://i.4cdn.org/{}/{}", } elif args.url == "8chan": args.url = { "catalog": "http://8ch.net/{}/threads.json", "threads": "http://8ch.net/{}/res/{}.json", "images": "http://8ch.net/{}/src/{}", } parse(args)