cli.py

import os
import shlex
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

import click
from scrapy.crawler import CrawlerProcess
from scrapy.utils.conf import arglist_to_dict

from .spider import CrauSpider
from .utils import WarcReader, get_urls_from_file
from .version import __version__

def run_command(command):
    print(f"*** Running command: {command}")
    return subprocess.call(shlex.split(command))
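
# Illustrative note (not part of the original file): the command string is
# shell-split, so e.g. run_command('wayback -p 8000 -b 127.0.0.1') executes
# ["wayback", "-p", "8000", "-b", "127.0.0.1"] and returns its exit code.
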
def load_settings(ctx, param, value):
    settings = {
        "HTTPCACHE_ENABLED": False,
        "LOG_LEVEL": "CRITICAL",
        "STATS_CLASS": "crau.utils.StdoutStatsCollector",
        "USER_AGENT": f"crau {__version__}",
    }
    settings.update(arglist_to_dict(value))
    return settings
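
# Illustrative note (not part of the original file): Scrapy's arglist_to_dict
# turns repeated "-s KEY=VALUE" options into a dict of overrides, e.g.
#   crau archive -s DOWNLOAD_DELAY=1 -s LOG_LEVEL=DEBUG example.warc.gz https://example.com/
# ends up with settings {"DOWNLOAD_DELAY": "1", "LOG_LEVEL": "DEBUG", ...}
# on top of the defaults above.
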
@click.group()
@click.version_option(version=__version__, prog_name="crau")
def cli():
    pass

@cli.command("list", help="List URIs of response records stored in a WARC file")
@click.argument("warc_filename")
def list_uris(warc_filename):
    warc = WarcReader(warc_filename)
    for record in warc:
        if record.rec_type == "response":
            click.echo(record.rec_headers.get_header("WARC-Target-URI"))
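
# Example usage (illustrative; the filename is a placeholder):
#   crau list example.warc.gz
# prints one WARC-Target-URI per response record found in the file.
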
@cli.command("extract", help="Extract URL content from archive")
@click.option("--chunk-size", default=512 * 1024)
@click.argument("warc_filename")
@click.argument("uri")
@click.argument("output")
def extract_uri(chunk_size, warc_filename, uri, output):
    warc = WarcReader(warc_filename)
    stream = warc.get_response(uri).content_stream()
    if output == "-":
        # "-" means: write the record's content to stdout
        data = stream.read(chunk_size)
        while data != b"":
            sys.stdout.buffer.write(data)
            data = stream.read(chunk_size)
    else:
        with open(output, mode="wb") as fobj:
            data = stream.read(chunk_size)
            while data != b"":
                fobj.write(data)
                data = stream.read(chunk_size)
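
# Example usage (illustrative placeholders):
#   crau extract example.warc.gz https://example.com/ page.html
#   crau extract example.warc.gz https://example.com/ -   # write to stdout
# Content is copied in --chunk-size blocks (512 KiB by default), so large
# records are never loaded into memory at once.
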
@cli.command("archive", help="Archive a list of URLs to a WARC file")
@click.argument("warc_filename")
@click.option("--input-filename", "-i")
@click.option("--input-encoding", default="utf-8")
@click.option("--cache", is_flag=True)
@click.option("--max-depth", default=1)
@click.option("--allowed-uris", multiple=True, default=[])
@click.option("--autothrottle", is_flag=True)
@click.option("--log-level", required=False)
@click.option("--user-agent", required=False)
@click.option("--settings", "-s", multiple=True, default=[], callback=load_settings)
@click.argument("URLs", nargs=-1, required=False)
def archive(
    warc_filename,
    input_filename,
    input_encoding,
    cache,
    max_depth,
    allowed_uris,
    autothrottle,
    log_level,
    settings,
    user_agent,
    urls,
):
    if not input_filename and not urls:
        click.echo(
            "ERROR: at least one URL must be provided (or a file containing one per line).",
            err=True,
        )
        exit(1)
    if input_filename:
        if not Path(input_filename).exists():
            click.echo(f"ERROR: filename {input_filename} does not exist.", err=True)
            exit(2)
        # URLs read from the file take the place of any passed on the command line
        urls = get_urls_from_file(input_filename, encoding=input_encoding)
    # Command-line flags override the defaults set by load_settings()
    if cache:
        settings["HTTPCACHE_ENABLED"] = True
    if log_level:
        settings["LOG_LEVEL"] = log_level
    if user_agent:
        settings["USER_AGENT"] = user_agent
    if autothrottle:
        settings.update(
            {
                "AUTOTHROTTLE_ENABLED": True,
                "AUTOTHROTTLE_DEBUG": True,
            }
        )
    process = CrawlerProcess(settings=settings)
    process.crawl(
        CrauSpider,
        warc_filename=warc_filename,
        urls=urls,
        max_depth=max_depth,
        allowed_uris=allowed_uris,
    )
    process.start()
    # TODO: if there's an error, print it
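
# Example usage (illustrative placeholders):
#   crau archive example.warc.gz https://example.com/
#   crau archive -i urls.txt --max-depth 2 --autothrottle example.warc.gz
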
@cli.command("play", help="Run a backend playing your archive")
@click.option("-p", "--port", default=8000)
@click.option("-b", "--bind", default="127.0.0.1")
@click.argument("warc_filename")
def play(warc_filename, port, bind):
    filename = Path(warc_filename)
    if not filename.exists():
        click.echo(f"ERROR: filename {warc_filename} does not exist.", err=True)
        exit(2)
    full_filename = filename.absolute()
    collection_name = filename.name.split(".")[0]
    # Build a throwaway pywb collection in a temporary directory and serve it
    temp_dir = tempfile.mkdtemp()
    old_cwd = os.getcwd()
    os.chdir(temp_dir)
    run_command(f'wb-manager init "{collection_name}"')
    run_command(f'wb-manager add "{collection_name}" "{full_filename}"')
    run_command(f"wayback -p {port} -b {bind}")
    # Restore the original working directory before deleting the temporary
    # one (removing the current working directory fails on some platforms)
    os.chdir(old_cwd)
    shutil.rmtree(temp_dir)
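
# Example usage (illustrative; requires pywb, which provides the wb-manager
# and wayback commands used above):
#   crau play example.warc.gz -p 8080
# then browse the archived pages at http://127.0.0.1:8080/example/.

# Illustrative guard (an assumption, not in the original file): the installed
# `crau` command normally invokes cli() through the package entry point, but
# this makes the module runnable directly as well.
if __name__ == "__main__":
    cli()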