import logging
import re
from collections import namedtuple
from urllib.parse import urljoin

from scrapy import Request, Spider, signals
from scrapy.utils.request import request_fingerprint
from warcio.warcwriter import WARCWriter

from .utils import resource_matches_base_url, write_warc_request_response

Resource = namedtuple("Resource", ["name", "type", "link_type", "content"])
Extractor = namedtuple("Extractor", ["name", "type", "link_type", "xpath"])
REGEXP_CSS_URL = re.compile(r"""url\(['"]?(.*?)['"]?\)""")
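# A hedged sketch of what REGEXP_CSS_URL extracts (the stylesheet snippet is
# made up): it captures the argument of CSS url() tokens, quoted or not.
#
#   REGEXP_CSS_URL.findall("a { background: url('bg.png') } b { cursor: url(x.gif) }")
#   # -> ['bg.png', 'x.gif']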

EXTRACTORS = [
    # Media (images, video etc.)
    Extractor(name="media", type="link", link_type="dependency", xpath="//img/@src"),
    Extractor(name="media", type="link", link_type="dependency", xpath="//audio/@src"),
    Extractor(name="media", type="link", link_type="dependency", xpath="//video/@src"),
    Extractor(name="media", type="link", link_type="dependency", xpath="//source/@src"),
    Extractor(name="media", type="link", link_type="dependency", xpath="//embed/@src"),
    Extractor(
        name="media", type="link", link_type="dependency", xpath="//object/@data"
    ),
    # CSS
    Extractor(
        name="css",
        type="link",
        link_type="dependency",
        xpath="//link[@rel = 'stylesheet']/@href",
    ),
    Extractor(name="css", type="code", link_type="dependency", xpath="//style/text()"),
    Extractor(name="css", type="code", link_type="dependency", xpath="//*/@style"),
    # JavaScript
    Extractor(name="js", type="link", link_type="dependency", xpath="//script/@src"),
    Extractor(name="js", type="code", link_type="dependency", xpath="//script/text()"),
    # TODO: add "javascript:XXX" on //a/@href etc.
    # TODO: add inline JS (onload, onchange, onclick etc.)
    # Internal/external links and iframes
    # TODO: iframe sources must be considered as if they were the same as the
    # current page being archived (same depth, get all dependencies etc.).
    Extractor(name="other", type="link", link_type="anchor", xpath="//iframe/@src"),
    Extractor(name="other", type="link", link_type="anchor", xpath="//a/@href"),
    Extractor(name="other", type="link", link_type="anchor", xpath="//area/@href"),
    Extractor(
        name="other",
        type="link",
        link_type="anchor",
        xpath="//link[not(@rel = 'stylesheet')]/@href",
    ),
    # TODO: link rel=icon should be considered a dependency (what about other
    # link rel=xxx?)
    # TODO: add all other "//link/@href"
]


def extract_resources(response):
    """Yield a Resource for each match of each extractor's XPath"""
    for extractor in EXTRACTORS:
        for content in response.xpath(extractor.xpath).extract():
            yield Resource(
                name=extractor.name,
                type=extractor.type,
                link_type=extractor.link_type,
                content=content,
            )
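

# A hedged sketch of what extract_resources() yields; the page below is made
# up, and scrapy.http.HtmlResponse ships with Scrapy:
#
#   from scrapy.http import HtmlResponse
#   response = HtmlResponse(
#       url="http://example.com/",
#       body=b'<img src="logo.png"><a href="/about">About</a>',
#   )
#   list(extract_resources(response))
#   # -> [Resource(name='media', type='link', link_type='dependency', content='logo.png'),
#   #     Resource(name='other', type='link', link_type='anchor', content='/about')]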


class CrauSpider(Spider):
    name = "crawler-spider"
    # Scrapy's redirect middleware is disabled on purpose: redirects are
    # followed manually in parse() so every hop gets archived. Dedup is also
    # custom (see make_request).
    custom_settings = {
        "CONCURRENT_REQUESTS": 256,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 16,
        "DNSCACHE_ENABLED": True,
        "DNSCACHE_SIZE": 500000,
        "DNS_TIMEOUT": 5,
        "DOWNLOAD_MAXSIZE": 5 * 1024 * 1024,
        "DOWNLOAD_TIMEOUT": 5,
        "REACTOR_THREADPOOL_MAXSIZE": 40,
        "REDIRECT_ENABLED": False,
        "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue",
        "SPIDER_MIDDLEWARES_BASE": {
            "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
            "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500,
            "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
            "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
        },
    }

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def __init__(self, warc_filename, urls, max_depth=1, allowed_uris=None):
        super().__init__()
        self.max_depth = int(max_depth)
        self.warc_filename = warc_filename
        self.urls = urls
        self._request_history = set()
        self.warc_fobj = None
        self.warc_writer = None
        self.allowed_uris = allowed_uris if allowed_uris else []

    def spider_closed(self, spider):
        if self.warc_fobj is not None:
            self.warc_fobj.close()

    def make_request(self, request_class=Request, *args, **kwargs):
        """Create a request, applying a custom dedup filter

        Returns None if an equivalent request was already made.
        """
        kwargs["dont_filter"] = kwargs.get("dont_filter", True)
        kwargs["errback"] = kwargs.get("errback", self.parse_request_error)
        meta = kwargs.get("meta", {})
        meta["handle_httpstatus_all"] = meta.get("handle_httpstatus_all", True)
        meta["dont_redirect"] = meta.get("dont_redirect", True)
        kwargs["meta"] = meta
        request = request_class(*args, **kwargs)
        # Strip the fragment: the server never sees it, so URLs differing
        # only after "#" point to the same resource.
        if "#" in request.url:
            request = request.replace(url=request.url[: request.url.find("#")])
        # This `if` filters duplicated requests - we don't use scrapy's dedup
        # filter because it has a bug which filters out requests in undesired
        # cases <https://github.com/scrapy/scrapy/issues/1225>.
        # TODO: check whether this dedup filter has the same problem scrapy
        # has (the problem is related to canonicalizing the request URL).
        request_hash = request_fingerprint(request)
        # TODO: maybe move this in-memory set to a temp file, since the
        # number of requests can be pretty large.
        if request_hash in self._request_history:
            return None
        else:
            self._request_history.add(request_hash)
            return request
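
    # A sketch of the dedup behavior (the URL is hypothetical): the first
    # call returns a Request; the second returns None, because the URL's
    # fingerprint is already in self._request_history.
    #
    #   spider.make_request(url="http://example.com/", callback=spider.parse)
    #   spider.make_request(url="http://example.com/", callback=spider.parse)  # -> None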

    def write_warc(self, response):
        # TODO: transform this method into `write_response` so we can have
        # response writers other than WARC (CSV, for example - it would be
        # great if we could add specific parsers to save the HTML's title
        # and text into CSV, for example).
        write_warc_request_response(self.warc_writer, response)

    def start_requests(self):
        """Start requests with depth = 0

        Depth will be 0 for all primary URLs and for all requisites (CSS,
        images and JS) of those URLs. For links found on those URLs, depth
        will be incremented, and so on.
        """
        self.warc_fobj = open(self.warc_filename, mode="wb")
        self.warc_writer = WARCWriter(self.warc_fobj, gzip=True)
        for url in self.urls:
            yield self.make_request(
                url=url, meta={"depth": 0, "main_url": url}, callback=self.parse
            )

    def parse(self, response):
        main_url = response.request.url
        # TODO: what if response.request.url != response.url?
        current_depth = response.request.meta["depth"]
        next_depth = current_depth + 1
        content_type = response.headers.get("Content-Type", b"").decode(
            "ascii"
        )  # TODO: decode properly
        if content_type and content_type.split(";")[0].lower() != "text/html":
            logging.debug(
                f"[{current_depth}] Content-Type of {main_url} is not text/html, "
                f"parsing as media"
            )
            self.parse_media(response)
            return
        logging.debug(f"[{current_depth}] Saving HTML {response.request.url}")
        self.write_warc(response)
        redirect_url = None
        if 300 <= response.status <= 399 and "Location" in response.headers:
            redirect_url = urljoin(
                response.request.url,
                response.headers["Location"].decode("ascii"),  # TODO: decode properly
            )
        for resource in extract_resources(response):
            if resource.type == "link":
                # TODO: handle "//" URLs correctly
                absolute_url = urljoin(main_url, resource.content)
                # Dependencies (CSS, images, JS) stay at the current depth;
                # anchors (links to other pages) go one level deeper.
                depth = None
                if resource.link_type == "dependency":
                    depth = current_depth
                elif resource.link_type == "anchor":
                    depth = next_depth
                for request in self.collect_link(
                    main_url, resource.name, absolute_url, depth
                ):
                    if request is None or (
                        redirect_url is not None and redirect_url == request.url
                    ):
                        continue
                    elif (
                        self.allowed_uris
                        and resource.link_type == "anchor"
                        and not resource_matches_base_url(
                            absolute_url, self.allowed_uris
                        )
                    ):
                        logging.info(f"Different domain. Skipping {absolute_url}.")
                        continue
                    yield request
            elif resource.type == "code":
                for request in self.collect_code(
                    main_url, resource.name, resource.content, current_depth
                ):
                    if request is None:
                        continue
                    yield request
        if redirect_url is not None:
            # TODO: how to deal with redirect loops?
            logging.debug(f"[{current_depth}] Redirecting to {redirect_url}")
            yield self.make_request(
                url=redirect_url,
                meta={"depth": current_depth, "main_url": main_url},
                callback=self.parse,
            )
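
    # Depth accounting sketch: with max_depth=1, a start page enters at
    # depth 0, its CSS/JS/images are fetched at depth 0 and its links are
    # followed at depth 1; links found on those depth-1 pages would require
    # depth 2 and are ignored by collect_link/collect_code.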

    def parse_request_error(self, failure):
        # TODO: should we do something with this failure?
        pass

    def parse_css(self, response):
        meta = response.request.meta
        for request in self.collect_code(
            response.request.url, "css", response.body, meta["depth"]
        ):
            if request is None:
                continue
            yield request
        logging.debug(f"Saving CSS {response.request.url}")
        self.write_warc(response)

    def parse_js(self, response):
        meta = response.request.meta
        for request in self.collect_code(
            response.request.url, "js", response.body, meta["depth"]
        ):
            if request is None:
                continue
            yield request
        logging.debug(f"Saving JS {response.request.url}")
        self.write_warc(response)

    def parse_media(self, response):
        logging.debug(f"Saving MEDIA {response.request.url}")
        self.write_warc(response)

    def collect_link(self, main_url, link_type, url, depth):
        if depth > self.max_depth:
            logging.debug(
                f"[{depth}] IGNORING (depth exceeded) get link {link_type} {url}"
            )
            return []
        elif not url.startswith("http"):
            logging.debug(f"[{depth}] IGNORING (not HTTP) get link {link_type} {url}")
            return []
        # Pick the callback by resource type; unknown types are parsed as
        # regular pages.
        if link_type == "media":
            callback = self.parse_media
        elif link_type == "css":
            callback = self.parse_css
        elif link_type == "js":
            callback = self.parse_js
        else:
            callback = self.parse
        return [
            self.make_request(
                url=url,
                callback=callback,
                meta={"depth": depth, "main_url": main_url},
            )
        ]

    def collect_code(self, main_url, code_type, code, depth):
        if depth > self.max_depth:
            logging.debug(
                f"[{depth}] IGNORING (depth exceeded) getting dependencies for {code_type}"
            )
            return []
        elif code_type == "css":
            if isinstance(code, bytes):
                code = code.decode("utf-8")  # TODO: decode properly
            requests = []
            for result in REGEXP_CSS_URL.findall(code):
                url = urljoin(main_url, result)
                if url.startswith("data:"):
                    continue
                requests.append(
                    self.make_request(
                        url=url,
                        callback=self.parse_media,
                        meta={"depth": depth, "main_url": main_url},
                    )
                )
            return requests
        elif code_type == "js":
            # TODO: extract other references from JS code
            return []
        else:
            logging.info(f"[{depth}] [TODO] PARSE CODE {code_type} {code}")
            return []

# TODO: change
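

# A minimal, hedged sketch of running this spider standalone. It assumes the
# module is importable as part of its package (the relative `.utils` import
# means it must be run with `python -m package.module`, not as a loose
# script), and "example.warc.gz" and the URL below are placeholders.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(
        CrauSpider,
        warc_filename="example.warc.gz",
        urls=["https://example.com/"],
        max_depth=1,
    )
    process.start()  # blocks until the crawl finishes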
|