def archive(
    warc_filename,
    input_filename,
    input_encoding,
    cache,
    max_depth,
    allowed_uris,
    autothrottle,
    log_level,
    settings,
    user_agent,
    urls,
):
    # Require at least one URL, passed either on the command line or via a file
    if not input_filename and not urls:
        click.echo(
            "ERROR: at least one URL must be provided (or a file containing one per line).",
            err=True,
        )
        exit(1)
    if input_filename:
        if not Path(input_filename).exists():
            click.echo(f"ERROR: filename {input_filename} does not exist.", err=True)
            exit(2)
        # Read URLs from the input file, one per line
        urls = get_urls_from_file(input_filename, encoding=input_encoding)

    # Apply optional overrides to the Scrapy settings
    if cache:
        settings["HTTPCACHE_ENABLED"] = True
    if log_level:
        settings["LOG_LEVEL"] = log_level
    if user_agent:
        settings["USER_AGENT"] = user_agent
    if autothrottle:
        settings.update(
            {
                "AUTOTHROTTLE_ENABLED": True,
                "AUTOTHROTTLE_DEBUG": True,
            }
        )

    # Run the spider and block until the crawl finishes
    process = CrawlerProcess(settings=settings)
    process.crawl(
        CrauSpider,
        warc_filename=warc_filename,
        urls=urls,
        max_depth=max_depth,
        allowed_uris=allowed_uris,
    )
    process.start()