cli_1.py

import click
from pathlib import Path
from scrapy.crawler import CrawlerProcess

# CrauSpider and get_urls_from_file are defined elsewhere in the crau package.


def archive(
    warc_filename,
    input_filename,
    input_encoding,
    cache,
    max_depth,
    allowed_uris,
    autothrottle,
    log_level,
    settings,
    user_agent,
    urls,
):
    # URLs must come either from the command line or from an input file.
    if not input_filename and not urls:
        click.echo(
            "ERROR: at least one URL must be provided (or a file containing one per line).",
            err=True,
        )
        exit(1)
    if input_filename:
        if not Path(input_filename).exists():
            click.echo(f"ERROR: filename {input_filename} does not exist.", err=True)
            exit(2)
        urls = get_urls_from_file(input_filename, encoding=input_encoding)

    # Translate CLI options into Scrapy settings.
    if cache:
        settings["HTTPCACHE_ENABLED"] = True
    if log_level:
        settings["LOG_LEVEL"] = log_level
    if user_agent:
        settings["USER_AGENT"] = user_agent
    if autothrottle:
        settings.update(
            {
                "AUTOTHROTTLE_ENABLED": True,
                "AUTOTHROTTLE_DEBUG": True,
            }
        )

    # Run the crawl and let the spider write the results to the WARC file.
    process = CrawlerProcess(settings=settings)
    process.crawl(
        CrauSpider,
        warc_filename=warc_filename,
        urls=urls,
        max_depth=max_depth,
        allowed_uris=allowed_uris,
    )
    process.start()
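For reference, a minimal sketch of calling archive() directly. The URL, output filename, user agent, and settings values below are illustrative assumptions, not part of the original snippet; in crau this function is normally wired up as a click command.

archive(
    warc_filename="example.warc.gz",    # assumed output filename
    input_filename=None,                # no URL file; URLs passed directly
    input_encoding="utf-8",
    cache=True,                         # turns on Scrapy's HTTP cache
    max_depth=1,
    allowed_uris=None,
    autothrottle=True,
    log_level="INFO",
    settings={},                        # extra Scrapy settings, if any
    user_agent="crau-example/0.1",      # assumed user agent string
    urls=["https://example.com/"],
)

Note that CrawlerProcess starts the Twisted reactor, so archive() blocks until the crawl finishes and can only be run once per Python process.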