archiver_2.py 7.5 KB

from functools import partial
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import argparse
import logging
import multiprocessing as mp
import re
import requests
import time
import xml.etree.ElementTree as ET

# Library version
__version__ = "1.9.1"

# String used to prefix local sitemaps
LOCAL_PREFIX = "file://"


def format_archive_url(url):
    """Given a URL, constructs an Archive URL to submit the archive request."""
    logging.debug("Creating archive URL for %s", url)
    SAVE_URL = "https://web.archive.org/save/"
    request_url = SAVE_URL + url

    return request_url


def call_archiver(request_url, rate_limit_wait, session):
    """Submit a url to the Internet Archive to archive."""
    if rate_limit_wait > 0:
        logging.debug("Sleeping for %s", rate_limit_wait)
        time.sleep(rate_limit_wait)

    logging.info("Calling archive url %s", request_url)
    r = session.head(request_url, allow_redirects=True)
    try:
        # Raise `requests.exceptions.HTTPError` if 4XX or 5XX status
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        logging.exception(e)
        raise


def get_namespace(element):
    """Extract the namespace using a regular expression."""
    match = re.match(r"\{.*\}", element.tag)
    return match.group(0) if match else ""


def download_remote_sitemap(sitemap_url, session):
    """Download the sitemap of the target website."""
    logging.debug("Downloading: %s", sitemap_url)
    r = session.get(sitemap_url)
    try:
        # Raise `requests.exceptions.HTTPError` if 4XX or 5XX status
        r.raise_for_status()
    except requests.exceptions.HTTPError as e:
        logging.exception(e)
        raise
    else:
        return r.text.encode("utf-8")


def load_local_sitemap(sitemap_filepath):
    """Load a local sitemap and return it as a string."""
    logging.debug("Loading local sitemap: %s", sitemap_filepath)

    if sitemap_filepath.startswith(LOCAL_PREFIX):
        sitemap_filepath = sitemap_filepath[len(LOCAL_PREFIX):]

    # Try to open the file, error on failure
    try:
        logging.debug("Opening local file '%s'", sitemap_filepath)
        with open(sitemap_filepath, "r") as fp:
            contents = fp.read()
    except IOError as e:
        logging.exception(e)
        raise

    return contents


def sitemap_is_local(sitemap_url):
    """Returns True if we believe a URI to be local, False otherwise."""
    return sitemap_url.startswith(LOCAL_PREFIX) or sitemap_url.startswith("/")


def extract_pages_from_sitemap(site_map_text):
    """Extract the various pages from the sitemap text."""
    root = ET.fromstring(site_map_text)

    # Sitemaps use a namespace in the XML, which we need to read
    namespace = get_namespace(root)

    urls = []
    for loc_node in root.findall(".//{}loc".format(namespace)):
        urls.append(loc_node.text)

    return set(urls)


def main():
    # Command line parsing
    parser = argparse.ArgumentParser(
        prog="archiver",
        description="A script to back up web pages with the Internet Archive",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s {version}".format(version=__version__),
    )
    parser.add_argument(
        "urls",
        nargs="*",
        default=[],
        help="the URLs of the pages to archive",
    )
    parser.add_argument(
        "--file",
        help="path to a file containing urls to save (one url per line)",
        required=False,
    )
    parser.add_argument(
        "--sitemaps",
        nargs="+",
        default=[],
        help="one or more URIs to sitemaps listing pages to archive; local paths must be prefixed with '{f}'".format(f=LOCAL_PREFIX),
        required=False,
    )
    parser.add_argument(
        "--log",
        help="set the logging level, defaults to WARNING",
        dest="log_level",
        default=logging.WARNING,
        choices=[
            "DEBUG",
            "INFO",
            "WARNING",
            "ERROR",
            "CRITICAL",
        ],
    )
    parser.add_argument(
        "--log-to-file",
        help="redirect logs to a file",
        dest="log_file",
        default=None,
    )
    parser.add_argument(
        "--archive-sitemap-also",
        help="also submit the URL of the sitemap to be archived",
        dest="archive_sitemap",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--jobs",
        "-j",
        help="run this many concurrent URL submissions, defaults to 1",
        default=1,
        type=int,
    )
    parser.add_argument(
        "--rate-limit-wait",
        help="number of seconds to wait between page requests to avoid flooding the archive site, defaults to 5; also used as the backoff factor for retries",
        dest="rate_limit_in_sec",
        default=5,
        type=int,
    )
    args = parser.parse_args()

    # Set the logging level based on the arguments
    #
    # If `filename` is None, the constructor will set up a stream, otherwise it
    # will use the file specified.
    logging.basicConfig(level=args.log_level, filename=args.log_file)

    logging.debug("Archiver Version: %s", __version__)
    logging.debug("Arguments: %s", args)

    archive_urls = []

    # Add the regular pages
    if args.urls:
        logging.info("Adding page URLs to archive")
        logging.debug("Page URLs to archive: %s", args.urls)
        archive_urls += map(format_archive_url, args.urls)

    # Set up retry and backoff
    session = requests.Session()
    session.max_redirects = 100  # Raise the redirect limit from the default of 30
    retries = Retry(
        total=5,
        backoff_factor=args.rate_limit_in_sec,
        status_forcelist=[500, 502, 503, 504, 520],
    )
    session.mount("https://", HTTPAdapter(max_retries=retries))
    session.mount("http://", HTTPAdapter(max_retries=retries))

    # Download and process the sitemaps
    remote_sitemaps = set()
    logging.info("Parsing sitemaps")
    for sitemap_url in args.sitemaps:
        # Save the remote ones, in case the user wants us to back them up
        if sitemap_is_local(sitemap_url):
            logging.debug("The sitemap '%s' is local.", sitemap_url)
            sitemap_xml = load_local_sitemap(sitemap_url)
        else:
            logging.debug("The sitemap '%s' is remote.", sitemap_url)
            if args.archive_sitemap:
                remote_sitemaps.add(sitemap_url)
            sitemap_xml = download_remote_sitemap(sitemap_url, session=session)

        for url in extract_pages_from_sitemap(sitemap_xml):
            archive_urls.append(format_archive_url(url))

    # Archive the sitemap as well, if requested
    if args.archive_sitemap:
        logging.info("Archiving sitemaps")
        if remote_sitemaps:
            archive_urls += map(format_archive_url, remote_sitemaps)
        else:
            logging.debug("No remote sitemaps to back up.")

    # And URLs from file
    if args.file:
        logging.info("Reading urls from file: %s", args.file)
        with open(args.file) as file:
            urls_from_file = (u.strip() for u in file.readlines() if u.strip())
            archive_urls += map(format_archive_url, urls_from_file)

    # Deduplicate URLs
    archive_urls = set(archive_urls)

    # Archive the URLs
    logging.debug("Archive URLs: %s", archive_urls)
    pool = mp.Pool(processes=args.jobs)
    partial_call = partial(
        call_archiver, rate_limit_wait=args.rate_limit_in_sec, session=session
    )
    pool.map(partial_call, archive_urls)
    pool.close()
    pool.join()


if __name__ == "__main__":
    main()
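
# ---------------------------------------------------------------------------
# Example invocations (illustrative only; built from the flags defined above.
# The URLs and file paths are placeholders, not values from this project):
#
#   python archiver_2.py https://example.com/page.html
#   python archiver_2.py --sitemaps https://example.com/sitemap.xml --jobs 4
#   python archiver_2.py --sitemaps file:///var/www/sitemap.xml --archive-sitemap-also
#   python archiver_2.py --file urls.txt --rate-limit-wait 10 --log INFO
# ---------------------------------------------------------------------------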