123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- import os
- import shlex
- import shutil
- import subprocess
- import sys
- import tempfile
- from pathlib import Path
- import click
- from scrapy.crawler import CrawlerProcess
- from scrapy.utils.conf import arglist_to_dict
- from .spider import CrauSpider
- from .utils import WarcReader, get_urls_from_file
- from .version import __version__
def run_command(command):
    """Announce a command line on stdout, run it, and return its exit status.

    The command string is tokenised with ``shlex.split`` and executed via
    ``subprocess.call``, so it is never interpreted by a shell.
    """
    # Announce before tokenising so the line is shown even if splitting fails.
    print(f"*** Running command: {command}")
    argv = shlex.split(command)
    exit_status = subprocess.call(argv)
    return exit_status
def load_settings(ctx, param, value):
    """Click option callback that builds the crawler settings dict.

    Starts from the defaults hard-coded below and overlays whatever
    ``arglist_to_dict`` produces from the values collected by the option.

    Args:
        ctx: Click context (unused; required by the callback signature).
        param: Click parameter being processed (unused).
        value: Sequence of strings collected from ``--settings``/``-s``;
            expected to look like ``NAME=VALUE``.

    Returns:
        dict: The default settings updated with the user-supplied overrides.

    Raises:
        click.BadParameter: If an entry cannot be parsed as ``NAME=VALUE``.
    """
    settings = {
        "HTTPCACHE_ENABLED": False,
        "LOG_LEVEL": "CRITICAL",
        "STATS_CLASS": "crau.utils.StdoutStatsCollector",
        "USER_AGENT": f"crau {__version__}",
    }
    try:
        overrides = arglist_to_dict(value)
    except ValueError as error:
        # An entry without "=" cannot be turned into a key/value pair; report
        # it as a usage error instead of letting a traceback reach the user.
        raise click.BadParameter("invalid value, use NAME=VALUE") from error
    settings.update(overrides)
    return settings
@click.group()
@click.version_option(version=__version__, prog_name="crau")
def cli():
    # Root command group: it does no work itself and only hosts the
    # subcommands registered through ``@cli.command``.
    # NOTE: intentionally no docstring here -- this group passes no ``help=``,
    # so click would show a docstring as the group's help text.
    return None
@cli.command("list", help="List URIs of response records stored in a WARC file")
@click.argument("warc_filename")
def list_uris(warc_filename):
    """Echo the ``WARC-Target-URI`` header of every ``response`` record."""
    for entry in WarcReader(warc_filename):
        # Only response records are listed; every other record type is skipped.
        if entry.rec_type != "response":
            continue
        target_uri = entry.rec_headers.get_header("WARC-Target-URI")
        click.echo(target_uri)
@cli.command("extract", help="Extract URL content from archive")
@click.option("--chunk-size", default=512 * 1024)
@click.argument("warc_filename")
@click.argument("uri")
@click.argument("output")
def extract_uri(chunk_size, warc_filename, uri, output):
    """Copy the stored response content for ``uri`` out of a WARC file.

    Args:
        chunk_size: Buffer size, in bytes, used while copying the content.
        warc_filename: Path of the WARC file to read.
        uri: URI whose stored response should be extracted.
        output: Destination file path, or ``-`` to write the raw bytes to
            standard output.
    """
    warc = WarcReader(warc_filename)
    # The response is looked up before the destination is opened, so a failed
    # lookup does not leave an empty output file behind.
    stream = warc.get_response(uri).content_stream()
    if output == "-":
        # Write through the binary layer of stdout: the content is bytes.
        shutil.copyfileobj(stream, sys.stdout.buffer, chunk_size)
    else:
        with open(output, mode="wb") as fobj:
            shutil.copyfileobj(stream, fobj, chunk_size)
@cli.command("archive", help="Archive a list of URLs to a WARC file")
@click.argument("warc_filename")
@click.option("--input-filename", "-i")
@click.option("--input-encoding", default="utf-8")
@click.option("--cache", is_flag=True)
@click.option("--max-depth", default=1)
@click.option("--allowed-uris", multiple=True, default=[])
@click.option("--autothrottle", is_flag=True)
@click.option("--log-level", required=False)
@click.option("--user-agent", required=False)
@click.option("--settings", "-s", multiple=True, default=[], callback=load_settings)
@click.argument("URLs", nargs=-1, required=False)
def archive(
    warc_filename,
    input_filename,
    input_encoding,
    cache,
    max_depth,
    allowed_uris,
    autothrottle,
    log_level,
    settings,
    user_agent,
    urls,
):
    """Run a ``CrauSpider`` crawl over the given URLs, targeting a WARC file.

    Args:
        warc_filename: Forwarded to the spider as ``warc_filename``.
        input_filename: Optional file to read the URLs from.
        input_encoding: Encoding used to read ``input_filename``.
        cache: When set, turns on ``HTTPCACHE_ENABLED``.
        max_depth: Forwarded to the spider as ``max_depth``.
        allowed_uris: Forwarded to the spider as ``allowed_uris``.
        autothrottle: When set, turns on ``AUTOTHROTTLE_ENABLED`` and
            ``AUTOTHROTTLE_DEBUG``.
        log_level: Optional override for the ``LOG_LEVEL`` setting.
        settings: Settings dict produced by the ``--settings`` callback.
        user_agent: Optional override for the ``USER_AGENT`` setting.
        urls: URLs given on the command line.

    Exits with status 1 when no URL source is given and with status 2 when
    ``input_filename`` does not exist.
    """
    if not input_filename and not urls:
        click.echo(
            "ERROR: at least one URL must be provided (or a file containing one per line).",
            err=True,
        )
        # sys.exit rather than the builtin exit(): the latter is injected by
        # the ``site`` module and is not guaranteed to exist.
        sys.exit(1)

    if input_filename:
        if not Path(input_filename).exists():
            click.echo(f"ERROR: filename {input_filename} does not exist.", err=True)
            sys.exit(2)
        # NOTE(review): URLs passed on the command line are replaced, not
        # merged, when an input file is given -- confirm this is intended.
        urls = get_urls_from_file(input_filename, encoding=input_encoding)

    # Dedicated flags take precedence over values supplied via --settings.
    if cache:
        settings["HTTPCACHE_ENABLED"] = True
    if log_level:
        settings["LOG_LEVEL"] = log_level
    if user_agent:
        settings["USER_AGENT"] = user_agent
    if autothrottle:
        settings.update(
            {
                "AUTOTHROTTLE_ENABLED": True,
                "AUTOTHROTTLE_DEBUG": True,
            }
        )

    process = CrawlerProcess(settings=settings)
    process.crawl(
        CrauSpider,
        warc_filename=warc_filename,
        urls=urls,
        max_depth=max_depth,
        allowed_uris=allowed_uris,
    )
    process.start()
-
@cli.command("play", help="Run a backend playing your archive")
@click.option("-p", "--port", default=8000)
@click.option("-b", "--bind", default="127.0.0.1")
@click.argument("warc_filename")
def play(warc_filename, port, bind):
    """Serve a WARC file using the ``wb-manager`` and ``wayback`` commands.

    A collection named after the file (the part of its name before the first
    dot) is initialised inside a temporary directory, the WARC file is added
    to it, and ``wayback`` is started from that directory. The temporary
    directory is removed and the working directory restored afterwards, even
    if one of the commands is interrupted.

    Args:
        warc_filename: Path of the WARC file to serve.
        port: Value passed to ``wayback -p``.
        bind: Value passed to ``wayback -b``.

    Exits with status 2 when ``warc_filename`` does not exist.
    """
    filename = Path(warc_filename)
    if not filename.exists():
        click.echo(f"ERROR: filename {warc_filename} does not exist.", err=True)
        # sys.exit rather than the builtin exit(): the latter is injected by
        # the ``site`` module and is not guaranteed to exist.
        sys.exit(2)

    # Resolve the path before changing directory so relative paths still work.
    full_filename = filename.absolute()
    collection_name = filename.name.split(".")[0]

    # run_command tokenises with shlex.split, so shlex.quote is the matching
    # way to keep each value a single argument whatever characters it holds.
    collection_arg = shlex.quote(collection_name)
    warc_arg = shlex.quote(str(full_filename))
    bind_arg = shlex.quote(str(bind))

    temp_dir = tempfile.mkdtemp()
    old_cwd = os.getcwd()
    try:
        os.chdir(temp_dir)
        run_command(f"wb-manager init {collection_arg}")
        run_command(f"wb-manager add {collection_arg} {warc_arg}")
        # Presumably a long-running server that is stopped with Ctrl-C, which
        # is why the cleanup below must live in a ``finally`` clause.
        run_command(f"wayback -p {port} -b {bind_arg}")
    finally:
        # Leave the directory before deleting it: removing the current
        # working directory fails on some platforms.
        os.chdir(old_cwd)
        shutil.rmtree(temp_dir)
|