- def archive(
-     warc_filename,
-     input_filename,
-     input_encoding,
-     cache,
-     max_depth,
-     allowed_uris,
-     autothrottle,
-     log_level,
-     settings,
-     user_agent,
-     urls,
- ):
-     if not input_filename and not urls:
-         click.echo(
-             "ERROR: at least one URL must be provided (or a file containing one per line).",
-             err=True,
-         )
-         exit(1)
-     if input_filename:
-         if not Path(input_filename).exists():
-             click.echo(f"ERROR: filename {input_filename} does not exist.", err=True)
-             exit(2)
-         urls = get_urls_from_file(input_filename, encoding=input_encoding)
-     if cache:
-         settings["HTTPCACHE_ENABLED"] = True
-     if log_level:
-         settings["LOG_LEVEL"] = log_level
-     if user_agent:
-         settings["USER_AGENT"] = user_agent
-     if autothrottle:
-         settings.update(
-             {
-                 "AUTOTHROTTLE_ENABLED": True,
-                 "AUTOTHROTTLE_DEBUG": True,
-             }
-         )
-     process = CrawlerProcess(settings=settings)
-     process.crawl(
-         CrauSpider,
-         warc_filename=warc_filename,
-         urls=urls,
-         max_depth=max_depth,
-         allowed_uris=allowed_uris,
-     )
-     process.start()
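
For reference, the removed function follows Scrapy's standard CrawlerProcess pattern: build a settings dict, register a spider class with crawl() (extra keyword arguments are forwarded to the spider's constructor), then block on start(). Below is a minimal, self-contained sketch of that pattern only; the spider class, URL, and settings values are illustrative placeholders, not part of crau, which registers its own CrauSpider with WARC-specific arguments.

    from scrapy import Spider
    from scrapy.crawler import CrawlerProcess


    class ExampleSpider(Spider):
        # Placeholder spider for illustration; crau passes CrauSpider here instead.
        name = "example"
        start_urls = ["https://example.com"]

        def parse(self, response):
            # Yield one item per page; crau's spider writes responses to a WARC file.
            yield {"title": response.css("title::text").get()}


    settings = {"LOG_LEVEL": "INFO"}  # assumed settings, analogous to the dict built above
    process = CrawlerProcess(settings=settings)
    process.crawl(ExampleSpider)  # keyword arguments given here reach the spider's __init__
    process.start()  # blocks until the crawl finishes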