spider.py

import logging
import re
from collections import namedtuple
from urllib.parse import urljoin

from scrapy import Request, Spider, signals
from scrapy.utils.request import request_fingerprint
from warcio.warcwriter import WARCWriter

from .utils import resource_matches_base_url, write_warc_request_response

Resource = namedtuple("Resource", ["name", "type", "link_type", "content"])
REGEXP_CSS_URL = re.compile(r"""url\(['"]?(.*?)['"]?\)""")
Extractor = namedtuple("Extractor", ["name", "type", "link_type", "xpath"])
EXTRACTORS = [
    # Media (images, video etc.)
    Extractor(name="media", type="link", link_type="dependency", xpath="//img/@src"),
    Extractor(name="media", type="link", link_type="dependency", xpath="//audio/@src"),
    Extractor(name="media", type="link", link_type="dependency", xpath="//video/@src"),
    Extractor(name="media", type="link", link_type="dependency", xpath="//source/@src"),
    Extractor(name="media", type="link", link_type="dependency", xpath="//embed/@src"),
    Extractor(
        name="media", type="link", link_type="dependency", xpath="//object/@data"
    ),
    # CSS
    Extractor(
        name="css",
        type="link",
        link_type="dependency",
        xpath="//link[@rel = 'stylesheet']/@href",
    ),
    Extractor(name="css", type="code", link_type="dependency", xpath="//style/text()"),
    Extractor(name="css", type="code", link_type="dependency", xpath="//*/@style"),
    # JavaScript
    Extractor(name="js", type="link", link_type="dependency", xpath="//script/@src"),
    Extractor(name="js", type="code", link_type="dependency", xpath="//script/text()"),
    # TODO: add "javascript:XXX" on //a/@href etc.
    # TODO: add inline JS (onload, onchange, onclick etc.)
    # Internal/external links and iframes
    # TODO: iframe sources must be considered as if they were the same as the
    # current page being archived (same depth, get all dependencies etc.).
    Extractor(name="other", type="link", link_type="anchor", xpath="//iframe/@src"),
    Extractor(name="other", type="link", link_type="anchor", xpath="//a/@href"),
    Extractor(name="other", type="link", link_type="anchor", xpath="//area/@href"),
    Extractor(
        name="other",
        type="link",
        link_type="anchor",
        xpath="//link[not(@rel = 'stylesheet')]/@href",
    ),
    # TODO: link rel=icon should be considered a dependency (what about other
    # link rel=xxx?)
    # TODO: add all other "//link/@href"
]


def extract_resources(response):
    for extractor in EXTRACTORS:
        for content in response.xpath(extractor.xpath).extract():
            yield Resource(
                name=extractor.name,
                type=extractor.type,
                link_type=extractor.link_type,
                content=content,
            )
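
# Example of what extract_resources() yields: for an <img src="logo.png"> tag,
# the first "media" extractor above produces
# Resource(name="media", type="link", link_type="dependency", content="logo.png").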


class CrauSpider(Spider):
    name = "crawler-spider"
    custom_settings = {
        "CONCURRENT_REQUESTS": 256,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 16,
        "DNSCACHE_ENABLED": True,
        "DNSCACHE_SIZE": 500000,
        "DNS_TIMEOUT": 5,
        "DOWNLOAD_MAXSIZE": 5 * 1024 * 1024,
        "DOWNLOAD_TIMEOUT": 5,
        "REACTOR_THREADPOOL_MAXSIZE": 40,
        "REDIRECT_ENABLED": False,
        "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue",
        "SPIDER_MIDDLEWARES_BASE": {
            "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
            "scrapy.spidermiddlewares.offsite.OffsiteMiddleware": 500,
            "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
            "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
        },
    }
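
    # Note: redirects are disabled (REDIRECT_ENABLED=False) because parse()
    # follows the Location header manually, so the redirect response itself is
    # also archived; request dedup is likewise done by hand in make_request()
    # instead of by scrapy's default dupefilter.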

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def __init__(self, warc_filename, urls, max_depth=1, allowed_uris=None):
        super().__init__()
        self.max_depth = int(max_depth)
        self.warc_filename = warc_filename
        self.urls = urls
        self._request_history = set()
        self.warc_fobj = None
        self.warc_writer = None
        self.allowed_uris = allowed_uris if allowed_uris else []

    def spider_closed(self, spider):
        if self.warc_fobj is not None:
            self.warc_fobj.close()

    def make_request(self, request_class=Request, *args, **kwargs):
        """Create a request, applying a custom dedup filter"""
        kwargs["dont_filter"] = kwargs.get("dont_filter", True)
        kwargs["errback"] = kwargs.get("errback", self.parse_request_error)
        meta = kwargs.get("meta", {})
        meta["handle_httpstatus_all"] = meta.get("handle_httpstatus_all", True)
        meta["dont_redirect"] = meta.get("dont_redirect", True)
        kwargs["meta"] = meta
        request = request_class(*args, **kwargs)
        if "#" in request.url:
            request = request.replace(url=request.url[: request.url.find("#")])

        # This `if` filters duplicated requests - we don't use scrapy's dedup
        # filter because it has a bug, which filters out requests in undesired
        # cases <https://github.com/scrapy/scrapy/issues/1225>.
        # TODO: check if this dedup filter does not have the same problem
        # scrapy has (the problem is related to canonicalizing the request URL).
        request_hash = request_fingerprint(request)
        # TODO: may move this in-memory set to a temp file since the number of
        # requests can be pretty large.
        if request_hash in self._request_history:
            return None
        else:
            self._request_history.add(request_hash)
            return request

    def write_warc(self, response):
        # TODO: transform this method into `write_response` so we can have
        # other response writers than WARC (CSV, for example - it would be
        # great if we could add specific parsers to save the HTML's title and
        # text into CSV, for example).
        write_warc_request_response(self.warc_writer, response)

    def start_requests(self):
        """Start requests with depth = 0

        depth will be 0 for all primary URLs and all requisites (CSS, images
        and JS) of these URLs. For links found on these URLs, depth will be
        incremented, and so on.
        """
        self.warc_fobj = open(self.warc_filename, mode="wb")
        self.warc_writer = WARCWriter(self.warc_fobj, gzip=True)
        for url in self.urls:
            yield self.make_request(
                url=url, meta={"depth": 0, "main_url": url}, callback=self.parse
            )

    def parse(self, response):
        main_url = response.request.url
        # TODO: what if response.request.url != response.url?
        current_depth = response.request.meta["depth"]
        next_depth = current_depth + 1
        content_type = response.headers.get("Content-Type", b"").decode(
            "ascii"
        )  # TODO: decode properly
        if content_type and content_type.split(";")[0].lower() != "text/html":
            logging.debug(
                f"[{current_depth}] Content-Type of {main_url} is not HTML, parsing as media"
            )
            yield self.parse_media(response)
            return

        logging.debug(f"[{current_depth}] Saving HTML {response.request.url}")
        self.write_warc(response)

        redirect_url = None
        if 300 <= response.status <= 399 and "Location" in response.headers:
            redirect_url = urljoin(
                response.request.url,
                response.headers["Location"].decode("ascii"),  # TODO: decode properly
            )

        for resource in extract_resources(response):
            if resource.type == "link":
                # TODO: handle "//" URLs correctly
                absolute_url = urljoin(main_url, resource.content)
                depth = None
                if resource.link_type == "dependency":
                    depth = current_depth
                elif resource.link_type == "anchor":
                    depth = next_depth
                for request in self.collect_link(
                    main_url, resource.name, absolute_url, depth
                ):
                    if request is None or (
                        redirect_url is not None and redirect_url == request.url
                    ):
                        continue
                    elif (
                        self.allowed_uris
                        and resource.link_type == "anchor"
                        and not resource_matches_base_url(
                            absolute_url, self.allowed_uris
                        )
                    ):
                        logging.info(f"Different domain. Skipping {absolute_url}.")
                        continue
                    yield request
            elif resource.type == "code":
                for request in self.collect_code(
                    main_url, resource.name, resource.content, current_depth
                ):
                    if request is None:
                        continue
                    yield request

        if redirect_url is not None:
            # TODO: how to deal with redirect loops?
            logging.debug(f"[{current_depth}] Redirecting to {redirect_url}")
            yield self.make_request(
                url=redirect_url,
                meta={"depth": current_depth, "main_url": main_url},
                callback=self.parse,
            )

    def parse_request_error(self, failure):
        # TODO: should we do something with this failure?
        pass

    def parse_css(self, response):
        meta = response.request.meta
        for request in self.collect_code(
            response.request.url, "css", response.body, meta["depth"]
        ):
            if request is None:
                continue
            yield request
        logging.debug(f"Saving CSS {response.request.url}")
        self.write_warc(response)

    def parse_js(self, response):
        meta = response.request.meta
        for request in self.collect_code(
            response.request.url, "js", response.body, meta["depth"]
        ):
            if request is None:
                continue
            yield request
        logging.debug(f"Saving JS {response.request.url}")
        self.write_warc(response)

    def parse_media(self, response):
        logging.debug(f"Saving MEDIA {response.request.url}")
        self.write_warc(response)

    def collect_link(self, main_url, link_type, url, depth):
        if depth > self.max_depth:
            logging.debug(
                f"[{depth}] IGNORING (depth exceeded) get link {link_type} {url}"
            )
            return []
        elif not url.startswith("http"):
            logging.debug(f"[{depth}] IGNORING (not HTTP) get link {link_type} {url}")
            return []

        if link_type == "media":
            return [
                self.make_request(
                    url=url,
                    callback=self.parse_media,
                    meta={"depth": depth, "main_url": main_url},
                )
            ]
        elif link_type == "css":
            return [
                self.make_request(
                    url=url,
                    callback=self.parse_css,
                    meta={"depth": depth, "main_url": main_url},
                )
            ]
        elif link_type == "js":
            return [
                self.make_request(
                    url=url,
                    callback=self.parse_js,
                    meta={"depth": depth, "main_url": main_url},
                )
            ]
        elif link_type == "other":
            return [
                self.make_request(
                    url=url,
                    callback=self.parse,
                    meta={"depth": depth, "main_url": main_url},
                )
            ]
        else:
            return [
                self.make_request(
                    url=url,
                    callback=self.parse,
                    meta={"depth": depth, "main_url": main_url},
                )
            ]

    def collect_code(self, main_url, code_type, code, depth):
        if depth > self.max_depth:
            logging.debug(
                f"[{depth}] IGNORING (depth exceeded) getting dependencies for {code_type}"
            )
            return []
        elif code_type == "css":
            if isinstance(code, bytes):
                code = code.decode("utf-8")  # TODO: decode properly
            requests = []
            for result in REGEXP_CSS_URL.findall(code):
                url = urljoin(main_url, result)
                if url.startswith("data:"):
                    continue
                requests.append(
                    self.make_request(
                        url=url,
                        callback=self.parse_media,
                        meta={"depth": depth, "main_url": main_url},
                    )
                )
            return requests
        elif code_type == "js":
            # TODO: extract other references from JS code
            return []
        else:
            logging.info(f"[{depth}] [TODO] PARSE CODE {code_type} {code}")
            return []
            # TODO: change
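

# Usage sketch (not part of this module): CrauSpider is a regular Scrapy
# spider, so it can also be driven programmatically via CrawlerProcess. The
# settings and file/URL values below are illustrative assumptions, not taken
# from this file:
#
#     from scrapy.crawler import CrawlerProcess
#
#     process = CrawlerProcess(settings={"USER_AGENT": "crau"})
#     process.crawl(
#         CrauSpider,
#         warc_filename="example.warc.gz",
#         urls=["https://example.com/"],
#         max_depth=1,
#     )
#     process.start()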