behavior.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. # Copyright (c) 2017–2018 crocoite contributors
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in
  11. # all copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. # THE SOFTWARE.
  20. """
  21. Behavior scripts (i.e. subclasses of Behavior) are a powerful method to
  22. manipulate websites loaded into Chrome. They are executed by the controller
  23. after the page started loading (onload), after it has been idle for a while
  24. (onstop) and after loading was stopped (onfinish).
  25. The scripts exercise their power either through DevTools API calls or by
  26. injecting JavaScript into the page context. Thus they can manipulate both the
  27. browser itself (DevTools; modify resolution, get DOM snapshot) and the
  28. page (JavaScript; trigger JavaScript events, call web APIs).
  29. They also emit (yield) data processable by any consumer registered to the
  30. controller. This allows storing captured screenshots inside WARC files, for
  31. instance.
  32. """
  33. import asyncio, json, os.path
  34. from base64 import b64decode
  35. from collections import OrderedDict
  36. import pkg_resources
  37. from html5lib.serializer import HTMLSerializer
  38. from yarl import URL
  39. import yaml
  40. from .util import getFormattedViewportMetrics
  41. from . import html
  42. from .html import StripAttributeFilter, StripTagFilter, ChromeTreeWalker
  43. from .devtools import Crashed, TabException
  44. class Script:
  45. """ A JavaScript resource """
  46. __slots__ = ('path', 'data')
  47. datadir = 'data'
  48. def __init__ (self, path=None, encoding='utf-8'):
  49. self.path = path
  50. if path:
  51. self.data = pkg_resources.resource_string (__name__, os.path.join (self.datadir, path)).decode (encoding)
  52. def __repr__ (self):
  53. return f'<Script {self.path}>'
  54. def __str__ (self):
  55. return self.data
  56. @property
  57. def abspath (self):
  58. return pkg_resources.resource_filename (__name__,
  59. os.path.join (self.datadir, self.path))
  60. @classmethod
  61. def fromStr (cls, data, path=None):
  62. s = Script ()
  63. s.data = data
  64. s.path = path
  65. return s
  66. class Behavior:
  67. __slots__ = ('loader', 'logger')
  68. # unique behavior name
  69. name = None
  70. def __init__ (self, loader, logger):
  71. assert self.name is not None
  72. self.loader = loader
  73. self.logger = logger.bind (context=type (self).__name__)
  74. def __contains__ (self, url):
  75. """
  76. Accept every URL by default
  77. """
  78. return True
  79. def __repr__ (self):
  80. return f'<Behavior {self.name}>'
  81. async def onload (self):
  82. """ After loading the page started """
  83. # this is a dirty hack to make this function an async generator
  84. return
  85. yield # pragma: no cover
  86. async def onstop (self):
  87. """ Before page loading is stopped """
  88. return
  89. yield # pragma: no cover
  90. async def onfinish (self):
  91. """ After the site has stopped loading """
  92. return
  93. yield # pragma: no cover
  94. class JsOnload (Behavior):
  95. """ Execute JavaScript on page load """
  96. __slots__ = ('script', 'context', 'options')
  97. scriptPath = None
  98. def __init__ (self, loader, logger):
  99. super ().__init__ (loader, logger)
  100. self.script = Script (self.scriptPath)
  101. self.context = None
  102. # options passed to constructor
  103. self.options = {}
  104. async def onload (self):
  105. tab = self.loader.tab
  106. yield self.script
  107. # This is slightly awkward, since we cannot compile the class into an
  108. # objectId and then reference it. Therefore the script must return a
  109. # class constructor, which is then called with a generic options
  110. # parameter.
  111. # XXX: is there a better way to do this?
  112. result = await tab.Runtime.evaluate (expression=str (self.script))
  113. self.logger.debug ('behavior onload inject',
  114. uuid='a2da9b78-5648-44c5-bfa8-5c7573e13ad3', result=result)
  115. exception = result.get ('exceptionDetails', None)
  116. result = result['result']
  117. assert result['type'] == 'function', result
  118. assert result.get ('subtype') != 'error', exception
  119. constructor = result['objectId']
  120. if self.options:
  121. yield Script.fromStr (json.dumps (self.options, indent=2), f'{self.script.path}#options')
  122. try:
  123. result = await tab.Runtime.callFunctionOn (
  124. functionDeclaration='function(options){return new this(options);}',
  125. objectId=constructor,
  126. arguments=[{'value': self.options}])
  127. self.logger.debug ('behavior onload start',
  128. uuid='6c0605ae-93b3-46b3-b575-ba45790909a7', result=result)
  129. result = result['result']
  130. assert result['type'] == 'object', result
  131. assert result.get ('subtype') != 'error', result
  132. self.context = result['objectId']
  133. except TabException as e:
  134. if e.args[0] == -32000:
  135. # the site probably reloaded. ignore this, since we’ll be
  136. # re-injected into the new site by the controller.
  137. self.logger.error ('jsonload onload failed',
  138. uuid='c151a863-78d1-41f4-a8e6-c022a6c5d252',
  139. exception=e.args)
  140. else:
  141. raise
  142. async def onstop (self):
  143. tab = self.loader.tab
  144. try:
  145. assert self.context is not None
  146. await tab.Runtime.callFunctionOn (functionDeclaration='function(){return this.stop();}',
  147. objectId=self.context)
  148. await tab.Runtime.releaseObject (objectId=self.context)
  149. except TabException as e:
  150. # cannot do anything about that. Ignoring should be fine.
  151. self.logger.error ('jsonload onstop failed',
  152. uuid='1786726f-c8ec-4f79-8769-30954d4e32f5',
  153. exception=e.args,
  154. objectId=self.context)
  155. return
  156. yield # pragma: no cover
  157. ### Generic scripts ###
  158. class Scroll (JsOnload):
  159. name = 'scroll'
  160. scriptPath = 'scroll.js'
  161. class EmulateScreenMetrics (Behavior):
  162. name = 'emulateScreenMetrics'
  163. async def onstop (self):
  164. """
  165. Emulate different screen sizes, causing the site to fetch assets (img
  166. srcset and css, for example) for different screen resolutions.
  167. """
  168. cssPpi = 96
  169. sizes = [
  170. {'width': 1920, 'height': 1080, 'deviceScaleFactor': 1.5, 'mobile': False},
  171. {'width': 1920, 'height': 1080, 'deviceScaleFactor': 2, 'mobile': False},
  172. # very dense display
  173. {'width': 1920, 'height': 1080, 'deviceScaleFactor': 4, 'mobile': False},
  174. # just a few samples:
  175. # 1st gen iPhone (portrait mode)
  176. {'width': 320, 'height': 480, 'deviceScaleFactor': 163/cssPpi, 'mobile': True},
  177. # 6th gen iPhone (portrait mode)
  178. {'width': 750, 'height': 1334, 'deviceScaleFactor': 326/cssPpi, 'mobile': True},
  179. ]
  180. l = self.loader
  181. tab = l.tab
  182. for s in sizes:
  183. self.logger.debug ('device override',
  184. uuid='3d2d8096-1a75-4830-ad79-ae5f6f97071d', **s)
  185. await tab.Emulation.setDeviceMetricsOverride (**s)
  186. # give the browser time to re-eval page and start requests
  187. # XXX: should wait until loader is not busy any more
  188. await asyncio.sleep (1)
  189. self.logger.debug ('clear override',
  190. uuid='f9401683-eb3a-4b86-9bb2-c8c5d876fc8d')
  191. await tab.Emulation.clearDeviceMetricsOverride ()
  192. return
  193. yield # pragma: no cover
  194. class DomSnapshotEvent:
  195. __slots__ = ('url', 'document', 'viewport')
  196. def __init__ (self, url, document, viewport):
  197. # XXX: document encoding?
  198. assert isinstance (document, bytes)
  199. self.url = url
  200. self.document = document
  201. self.viewport = viewport
  202. class DomSnapshot (Behavior):
  203. """
  204. Get a DOM snapshot of tab and write it to WARC.
  205. We could use DOMSnapshot.getSnapshot here, but the API is not stable
  206. yet. Also computed styles are not really necessary here.
  207. """
  208. __slots__ = ('script', )
  209. name = 'domSnapshot'
  210. def __init__ (self, loader, logger):
  211. super ().__init__ (loader, logger)
  212. self.script = Script ('canvas-snapshot.js')
  213. async def onfinish (self):
  214. tab = self.loader.tab
  215. yield self.script
  216. await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
  217. viewport = await getFormattedViewportMetrics (tab)
  218. dom = await tab.DOM.getDocument (depth=-1, pierce=True)
  219. self.logger.debug ('dom snapshot document',
  220. uuid='0c720784-8bd1-4fdc-a811-84394d753539', dom=dom)
  221. haveUrls = set ()
  222. for doc in ChromeTreeWalker (dom['root']).split ():
  223. url = URL (doc['documentURL'])
  224. if url in haveUrls:
  225. # ignore duplicate URLs. they are usually caused by
  226. # javascript-injected iframes (advertising) with no(?) src
  227. self.logger.warning ('dom snapshot duplicate',
  228. uuid='d44de989-98d4-456e-82e7-9d4c49acab5e')
  229. elif url.scheme in ('http', 'https'):
  230. self.logger.debug ('dom snapshot',
  231. uuid='ece7ff05-ccd9-44b5-b6a8-be25a24b96f4',
  232. base=doc["baseURL"])
  233. haveUrls.add (url)
  234. walker = ChromeTreeWalker (doc)
  235. # remove script, to make the page static and noscript, because at the
  236. # time we took the snapshot scripts were enabled
  237. disallowedTags = ['script', 'noscript']
  238. disallowedAttributes = html.eventAttributes
  239. stream = StripAttributeFilter (StripTagFilter (walker, disallowedTags), disallowedAttributes)
  240. serializer = HTMLSerializer ()
  241. yield DomSnapshotEvent (url.with_fragment(None), serializer.render (stream, 'utf-8'), viewport)
  242. class ScreenshotEvent:
  243. __slots__ = ('yoff', 'data', 'url')
  244. def __init__ (self, url, yoff, data):
  245. self.url = url
  246. self.yoff = yoff
  247. self.data = data
  248. class Screenshot (Behavior):
  249. """
  250. Create screenshot from tab and write it to WARC
  251. Chrome will allocate an additional 512MB of RAM when using this plugin.
  252. """
  253. __slots__ = ('script')
  254. name = 'screenshot'
  255. # Hardcoded max texture size of 16,384 (crbug.com/770769)
  256. maxDim = 16*1024
  257. def __init__ (self, loader, logger):
  258. super ().__init__ (loader, logger)
  259. self.script = Script ('screenshot.js')
  260. async def onfinish (self):
  261. tab = self.loader.tab
  262. # for top-level/full-screen elements with position: fixed we need to
  263. # figure out their actual size (i.e. scrollHeight) and use that when
  264. # overriding the viewport size.
  265. # we could do this without javascript, but that would require several
  266. # round-trips to Chrome or pulling down the entire DOM+computed styles
  267. tab = self.loader.tab
  268. yield self.script
  269. result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
  270. assert result['result']['type'] == 'object', result
  271. result = result['result']['value']
  272. # this is required to make the browser render more than just the small
  273. # actual viewport (i.e. entire page). see
  274. # https://github.com/GoogleChrome/puppeteer/blob/45873ea737b4ebe4fa7d6f46256b2ea19ce18aa7/lib/Page.js#L805
  275. metrics = await tab.Page.getLayoutMetrics ()
  276. contentSize = metrics['contentSize']
  277. contentHeight = max (result + [contentSize['height']])
  278. override = {
  279. 'width': 0,
  280. 'height': 0,
  281. 'deviceScaleFactor': 0,
  282. 'mobile': False,
  283. 'viewport': {'x': 0,
  284. 'y': 0,
  285. 'width': contentSize['width'],
  286. 'height': contentHeight,
  287. 'scale': 1}
  288. }
  289. self.logger.debug ('screenshot override',
  290. uuid='e0affa18-cbb1-4d97-9d13-9a88f704b1b2', override=override)
  291. await tab.Emulation.setDeviceMetricsOverride (**override)
  292. tree = await tab.Page.getFrameTree ()
  293. try:
  294. url = URL (tree['frameTree']['frame']['url']).with_fragment (None)
  295. except KeyError:
  296. self.logger.error ('frame without url',
  297. uuid='edc2743d-b93e-4ba1-964e-db232f2f96ff', tree=tree)
  298. url = None
  299. width = min (contentSize['width'], self.maxDim)
  300. # we’re ignoring horizontal scroll intentionally. Most horizontal
  301. # layouts use JavaScript scrolling and don’t extend the viewport.
  302. for yoff in range (0, contentHeight, self.maxDim):
  303. height = min (contentHeight - yoff, self.maxDim)
  304. clip = {'x': 0, 'y': yoff, 'width': width, 'height': height, 'scale': 1}
  305. ret = await tab.Page.captureScreenshot (format='png', clip=clip)
  306. data = b64decode (ret['data'])
  307. yield ScreenshotEvent (url, yoff, data)
  308. await tab.Emulation.clearDeviceMetricsOverride ()
  309. class Click (JsOnload):
  310. """ Generic link clicking """
  311. name = 'click'
  312. scriptPath = 'click.js'
  313. def __init__ (self, loader, logger):
  314. super ().__init__ (loader, logger)
  315. with pkg_resources.resource_stream (__name__, os.path.join ('data', 'click.yaml')) as fd:
  316. self.options['sites'] = list (yaml.safe_load_all (fd))
  317. class ExtractLinksEvent:
  318. __slots__ = ('links', )
  319. def __init__ (self, links):
  320. self.links = links
  321. def __repr__ (self):
  322. return f'<ExtractLinksEvent {self.links!r}>'
  323. def mapOrIgnore (f, l):
  324. for e in l:
  325. try:
  326. yield f (e)
  327. except:
  328. pass
  329. class ExtractLinks (Behavior):
  330. """
  331. Extract links from a page using JavaScript
  332. We could retrieve a HTML snapshot and extract links here, but we’d have to
  333. manually resolve relative links.
  334. """
  335. __slots__ = ('script', )
  336. name = 'extractLinks'
  337. def __init__ (self, loader, logger):
  338. super ().__init__ (loader, logger)
  339. self.script = Script ('extract-links.js')
  340. async def onfinish (self):
  341. tab = self.loader.tab
  342. yield self.script
  343. result = await tab.Runtime.evaluate (expression=str (self.script), returnByValue=True)
  344. yield ExtractLinksEvent (list (set (mapOrIgnore (URL, result['result']['value']))))
  345. class Crash (Behavior):
  346. """ Crash the browser. For testing only. Obviously. """
  347. name = 'crash'
  348. async def onstop (self):
  349. try:
  350. await self.loader.tab.Page.crash ()
  351. except Crashed:
  352. pass
  353. return
  354. yield # pragma: no cover
  355. # available behavior scripts. Order matters, move those modifying the page
  356. # towards the end of available
  357. available = [Scroll, Click, ExtractLinks, Screenshot, EmulateScreenMetrics, DomSnapshot]
  358. #available.append (Crash)
  359. # order matters, since behavior can modify the page (dom snapshots, for instance)
  360. availableMap = OrderedDict (map (lambda x: (x.name, x), available))