cli_1.py

import click
from pathlib import Path
from scrapy.crawler import CrawlerProcess

# CrauSpider and get_urls_from_file are defined elsewhere in the crau package.


def archive(
    warc_filename,
    input_filename,
    input_encoding,
    cache,
    max_depth,
    allowed_uris,
    autothrottle,
    log_level,
    settings,
    user_agent,
    urls,
):
    # URLs must come either from the command line or from an input file.
    if not input_filename and not urls:
        click.echo(
            "ERROR: at least one URL must be provided (or a file containing one per line).",
            err=True,
        )
        exit(1)
    if input_filename:
        if not Path(input_filename).exists():
            click.echo(f"ERROR: filename {input_filename} does not exist.", err=True)
            exit(2)
        urls = get_urls_from_file(input_filename, encoding=input_encoding)

    # Translate CLI options into Scrapy settings.
    if cache:
        settings["HTTPCACHE_ENABLED"] = True
    if log_level:
        settings["LOG_LEVEL"] = log_level
    if user_agent:
        settings["USER_AGENT"] = user_agent
    if autothrottle:
        settings.update(
            {
                "AUTOTHROTTLE_ENABLED": True,
                "AUTOTHROTTLE_DEBUG": True,
            }
        )

    # Run the crawl and let the spider write the results to the WARC file.
    process = CrawlerProcess(settings=settings)
    process.crawl(
        CrauSpider,
        warc_filename=warc_filename,
        urls=urls,
        max_depth=max_depth,
        allowed_uris=allowed_uris,
    )
    process.start()
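For reference, a minimal sketch of calling archive() directly. The URL, output filename, user agent, and settings values below are illustrative assumptions, not part of the original snippet; in crau this function is normally wired up as a click command.

archive(
    warc_filename="example.warc.gz",    # assumed output filename
    input_filename=None,                # no URL file; URLs passed directly
    input_encoding="utf-8",
    cache=True,                         # turns on Scrapy's HTTP cache
    max_depth=1,
    allowed_uris=None,
    autothrottle=True,
    log_level="INFO",
    settings={},                        # extra Scrapy settings, if any
    user_agent="crau-example/0.1",      # assumed user agent string
    urls=["https://example.com/"],
)

Note that CrawlerProcess starts the Twisted reactor, so archive() blocks until the crawl finishes and can only be run once per Python process.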