browser.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519
  1. # Copyright (c) 2017 crocoite contributors
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in
  11. # all copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. # THE SOFTWARE.
  20. """
  21. Chrome browser interactions.
  22. """
  23. import asyncio
  24. from base64 import b64decode, b64encode
  25. from datetime import datetime, timedelta
  26. from http.server import BaseHTTPRequestHandler
  27. from yarl import URL
  28. from multidict import CIMultiDict
  29. from .logger import Level
  30. from .devtools import Browser, TabException
  31. # These two classes’ only purpose is so we can later tell whether a body was
  32. # base64-encoded or a unicode string
  33. class Base64Body (bytes):
  34. def __new__ (cls, value):
  35. return bytes.__new__ (cls, b64decode (value))
  36. @classmethod
  37. def fromBytes (cls, b):
  38. """ For testing """
  39. return cls (b64encode (b))
  40. class UnicodeBody (bytes):
  41. def __new__ (cls, value):
  42. if type (value) is not str:
  43. raise TypeError ('expecting unicode string')
  44. return bytes.__new__ (cls, value.encode ('utf-8'))
  45. class Request:
  46. __slots__ = ('headers', 'body', 'initiator', 'hasPostData', 'method', 'timestamp')
  47. def __init__ (self, method=None, headers=None, body=None):
  48. self.headers = headers
  49. self.body = body
  50. self.hasPostData = False
  51. self.initiator = None
  52. # HTTP method
  53. self.method = method
  54. self.timestamp = None
  55. def __repr__ (self):
  56. return f'Request({self.method!r}, {self.headers!r}, {self.body!r})'
  57. def __eq__ (self, b):
  58. if b is None:
  59. return False
  60. if not isinstance (b, Request):
  61. raise TypeError ('Can only compare equality with Request.')
  62. # do not compare hasPostData (only required to fetch body) and
  63. # timestamp (depends on time)
  64. return self.headers == b.headers and \
  65. self.body == b.body and \
  66. self.initiator == b.initiator and \
  67. self.method == b.method
  68. class Response:
  69. __slots__ = ('status', 'statusText', 'headers', 'body', 'bytesReceived',
  70. 'timestamp', 'mimeType')
  71. def __init__ (self, status=None, statusText=None, headers=None, body=None, mimeType=None):
  72. self.status = status
  73. self.statusText = statusText
  74. self.headers = headers
  75. self.body = body
  76. # bytes received over the network (not body size!)
  77. self.bytesReceived = 0
  78. self.timestamp = None
  79. self.mimeType = mimeType
  80. def __repr__ (self):
  81. return f'Response({self.status!r}, {self.statusText!r}, {self.headers!r}, {self.body!r}, {self.mimeType!r})'
  82. def __eq__ (self, b):
  83. if b is None:
  84. return False
  85. if not isinstance (b, Response):
  86. raise TypeError ('Can only compare equality with Response.')
  87. # do not compare bytesReceived (depends on network), timestamp
  88. # (depends on time) and statusText (does not matter)
  89. return self.status == b.status and \
  90. self.statusText == b.statusText and \
  91. self.headers == b.headers and \
  92. self.body == b.body and \
  93. self.mimeType == b.mimeType
  94. class ReferenceTimestamp:
  95. """ Map relative timestamp to absolute timestamp """
  96. def __init__ (self, relative, absolute):
  97. self.relative = timedelta (seconds=relative)
  98. self.absolute = datetime.utcfromtimestamp (absolute)
  99. def __call__ (self, relative):
  100. if not isinstance (relative, timedelta):
  101. relative = timedelta (seconds=relative)
  102. return self.absolute + (relative-self.relative)
  103. class RequestResponsePair:
  104. __slots__ = ('request', 'response', 'id', 'url', 'remoteIpAddress',
  105. 'protocol', 'resourceType', '_time')
  106. def __init__ (self, id=None, url=None, request=None, response=None):
  107. self.request = request
  108. self.response = response
  109. self.id = id
  110. self.url = url
  111. self.remoteIpAddress = None
  112. self.protocol = None
  113. self.resourceType = None
  114. self._time = None
  115. def __repr__ (self):
  116. return f'RequestResponsePair({self.id!r}, {self.url!r}, {self.request!r}, {self.response!r})'
  117. def __eq__ (self, b):
  118. if not isinstance (b, RequestResponsePair):
  119. raise TypeError (f'Can only compare with {self.__class__.__name__}')
  120. # do not compare id and _time. These depend on external factors and do
  121. # not influence the request/response *content*
  122. return self.request == b.request and \
  123. self.response == b.response and \
  124. self.url == b.url and \
  125. self.remoteIpAddress == b.remoteIpAddress and \
  126. self.protocol == b.protocol and \
  127. self.resourceType == b.resourceType
  128. def fromRequestWillBeSent (self, req):
  129. """ Set request data from Chrome Network.requestWillBeSent event """
  130. r = req['request']
  131. self.id = req['requestId']
  132. self.url = URL (r['url'])
  133. self.resourceType = req.get ('type')
  134. self._time = ReferenceTimestamp (req['timestamp'], req['wallTime'])
  135. assert self.request is None, req
  136. self.request = Request ()
  137. self.request.initiator = req['initiator']
  138. self.request.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
  139. self.request.hasPostData = r.get ('hasPostData', False)
  140. self.request.method = r['method']
  141. self.request.timestamp = self._time (req['timestamp'])
  142. if self.request.hasPostData:
  143. postData = r.get ('postData')
  144. if postData is not None:
  145. self.request.body = UnicodeBody (postData)
  146. def fromResponse (self, r, timestamp=None, resourceType=None):
  147. """
  148. Set response data from Chrome’s Response object.
  149. Request must exist. Updates if response was set before. Sometimes
  150. fromResponseReceived is triggered twice by Chrome. No idea why.
  151. """
  152. assert self.request is not None, (self.request, r)
  153. if not timestamp:
  154. timestamp = self.request.timestamp
  155. self.remoteIpAddress = r.get ('remoteIPAddress')
  156. self.protocol = r.get ('protocol')
  157. if resourceType:
  158. self.resourceType = resourceType
  159. # a response may contain updated request headers (i.e. those actually
  160. # sent over the wire)
  161. if 'requestHeaders' in r:
  162. self.request.headers = CIMultiDict (self._unfoldHeaders (r['requestHeaders']))
  163. self.response = Response ()
  164. self.response.headers = CIMultiDict (self._unfoldHeaders (r['headers']))
  165. self.response.status = r['status']
  166. self.response.statusText = r['statusText']
  167. self.response.timestamp = timestamp
  168. self.response.mimeType = r['mimeType']
  169. def fromResponseReceived (self, resp):
  170. """ Set response data from Chrome Network.responseReceived """
  171. return self.fromResponse (resp['response'],
  172. self._time (resp['timestamp']), resp['type'])
  173. def fromLoadingFinished (self, data):
  174. self.response.bytesReceived = data['encodedDataLength']
  175. def fromLoadingFailed (self, data):
  176. self.response = None
  177. @staticmethod
  178. def _unfoldHeaders (headers):
  179. """
  180. A host may send multiple headers using the same key, which Chrome folds
  181. into the same item. Separate those.
  182. """
  183. items = []
  184. for k in headers.keys ():
  185. for v in headers[k].split ('\n'):
  186. items.append ((k, v))
  187. return items
  188. async def prefetchRequestBody (self, tab):
  189. if self.request.hasPostData and self.request.body is None:
  190. try:
  191. postData = await tab.Network.getRequestPostData (requestId=self.id)
  192. self.request.body = UnicodeBody (postData['postData'])
  193. except TabException:
  194. self.request.body = None
  195. async def prefetchResponseBody (self, tab):
  196. """ Fetch response body """
  197. try:
  198. body = await tab.Network.getResponseBody (requestId=self.id)
  199. if body['base64Encoded']:
  200. self.response.body = Base64Body (body['body'])
  201. else:
  202. self.response.body = UnicodeBody (body['body'])
  203. except TabException:
  204. self.response.body = None
  205. class NavigateError (IOError):
  206. pass
  207. class PageIdle:
  208. """ Page idle event """
  209. __slots__ = ('idle', )
  210. def __init__ (self, idle):
  211. self.idle = idle
  212. def __bool__ (self):
  213. return self.idle
  214. class FrameNavigated:
  215. __slots__ = ('id', 'url', 'mimeType')
  216. def __init__ (self, id, url, mimeType):
  217. self.id = id
  218. self.url = URL (url)
  219. self.mimeType = mimeType
  220. class SiteLoader:
  221. """
  222. Load site in Chrome and monitor network requests
  223. XXX: track popup windows/new tabs and close them
  224. """
  225. __slots__ = ('requests', 'browser', 'logger', 'tab', '_iterRunning',
  226. '_framesLoading', '_rootFrame')
  227. allowedSchemes = {'http', 'https'}
  228. def __init__ (self, browser, logger):
  229. self.requests = {}
  230. self.browser = Browser (url=browser)
  231. self.logger = logger.bind (context=type (self).__name__)
  232. self._iterRunning = []
  233. self._framesLoading = set ()
  234. self._rootFrame = None
  235. async def __aenter__ (self):
  236. tab = self.tab = await self.browser.__aenter__ ()
  237. # enable events
  238. await asyncio.gather (*[
  239. tab.Log.enable (),
  240. tab.Network.enable(),
  241. tab.Page.enable (),
  242. tab.Inspector.enable (),
  243. tab.Network.clearBrowserCache (),
  244. tab.Network.clearBrowserCookies (),
  245. ])
  246. return self
  247. async def __aexit__ (self, exc_type, exc_value, traceback):
  248. for task in self._iterRunning:
  249. # ignore any results from stuff we did not end up using anyway
  250. if not task.done ():
  251. task.cancel ()
  252. self._iterRunning = []
  253. await self.browser.__aexit__ (exc_type, exc_value, traceback)
  254. self.tab = None
  255. return False
  256. def __len__ (self):
  257. return len (self.requests)
  258. async def __aiter__ (self):
  259. """ Retrieve network items """
  260. tab = self.tab
  261. assert tab is not None
  262. handler = {
  263. tab.Network.requestWillBeSent: self._requestWillBeSent,
  264. tab.Network.responseReceived: self._responseReceived,
  265. tab.Network.loadingFinished: self._loadingFinished,
  266. tab.Network.loadingFailed: self._loadingFailed,
  267. tab.Log.entryAdded: self._entryAdded,
  268. tab.Page.javascriptDialogOpening: self._javascriptDialogOpening,
  269. tab.Page.frameStartedLoading: self._frameStartedLoading,
  270. tab.Page.frameStoppedLoading: self._frameStoppedLoading,
  271. tab.Page.frameNavigated: self._frameNavigated,
  272. }
  273. # The implementation is a little advanced. Why? The goal here is to
  274. # process events from the tab as quickly as possible (i.e.
  275. # asynchronously). We need to make sure that JavaScript dialogs are
  276. # handled immediately for instance. Otherwise they stall every
  277. # other request. Also, we don’t want to use an unbounded queue,
  278. # since the items yielded can get quite big (response body). Thus
  279. # we need to block (yield) for every item completed, but not
  280. # handled by the consumer (caller).
  281. running = self._iterRunning
  282. tabGetTask = asyncio.ensure_future (self.tab.get ())
  283. running.append (tabGetTask)
  284. while True:
  285. done, pending = await asyncio.wait (running, return_when=asyncio.FIRST_COMPLETED)
  286. for t in done:
  287. result = t.result ()
  288. if result is None:
  289. pass
  290. elif t == tabGetTask:
  291. method, data = result
  292. f = handler.get (method, None)
  293. if f is not None:
  294. task = asyncio.ensure_future (f (**data))
  295. pending.add (task)
  296. tabGetTask = asyncio.ensure_future (self.tab.get ())
  297. pending.add (tabGetTask)
  298. else:
  299. yield result
  300. running = pending
  301. self._iterRunning = running
  302. async def navigate (self, url):
  303. ret = await self.tab.Page.navigate(url=url)
  304. self.logger.debug ('navigate',
  305. uuid='9d47ded2-951f-4e09-86ee-fd4151e20666', result=ret)
  306. if 'errorText' in ret:
  307. raise NavigateError (ret['errorText'])
  308. self._rootFrame = ret['frameId']
  309. # internal chrome callbacks
  310. async def _requestWillBeSent (self, **kwargs):
  311. self.logger.debug ('requestWillBeSent',
  312. uuid='b828d75a-650d-42d2-8c66-14f4547512da', args=kwargs)
  313. reqId = kwargs['requestId']
  314. req = kwargs['request']
  315. url = URL (req['url'])
  316. logger = self.logger.bind (reqId=reqId, reqUrl=url)
  317. if url.scheme not in self.allowedSchemes:
  318. return
  319. ret = None
  320. item = self.requests.get (reqId)
  321. if item:
  322. # redirects never “finish” loading, but yield another requestWillBeSent with this key set
  323. redirectResp = kwargs.get ('redirectResponse')
  324. if redirectResp:
  325. if item.url != url:
  326. # this happens for unknown reasons. the docs simply state
  327. # it can differ in case of a redirect. Fix it and move on.
  328. logger.warning ('redirect url differs',
  329. uuid='558a7df7-2258-4fe4-b16d-22b6019cc163',
  330. expected=item.url)
  331. redirectResp['url'] = str (item.url)
  332. item.fromResponse (redirectResp)
  333. logger.info ('redirect', uuid='85eaec41-e2a9-49c2-9445-6f19690278b8', target=url)
  334. # XXX: queue this? no need to wait for it
  335. await item.prefetchRequestBody (self.tab)
  336. # cannot fetch response body due to race condition (item id reused)
  337. ret = item
  338. else:
  339. logger.warning ('request exists', uuid='2c989142-ba00-4791-bb03-c2a14e91a56b')
  340. item = RequestResponsePair ()
  341. item.fromRequestWillBeSent (kwargs)
  342. self.requests[reqId] = item
  343. return ret
  344. async def _responseReceived (self, **kwargs):
  345. self.logger.debug ('responseReceived',
  346. uuid='ecd67e69-401a-41cb-b4ec-eeb1f1ec6abb', args=kwargs)
  347. reqId = kwargs['requestId']
  348. item = self.requests.get (reqId)
  349. if item is None:
  350. return
  351. resp = kwargs['response']
  352. url = URL (resp['url'])
  353. logger = self.logger.bind (reqId=reqId, respUrl=url)
  354. if item.url != url:
  355. logger.error ('url mismatch', uuid='7385f45f-0b06-4cbc-81f9-67bcd72ee7d0', respUrl=url)
  356. if url.scheme in self.allowedSchemes:
  357. item.fromResponseReceived (kwargs)
  358. else:
  359. logger.warning ('scheme forbidden', uuid='2ea6e5d7-dd3b-4881-b9de-156c1751c666')
  360. async def _loadingFinished (self, **kwargs):
  361. """
  362. Item was fully loaded. For some items the request body is not available
  363. when responseReceived is fired, thus move everything here.
  364. """
  365. self.logger.debug ('loadingFinished',
  366. uuid='35479405-a5b5-4395-8c33-d3601d1796b9', args=kwargs)
  367. reqId = kwargs['requestId']
  368. item = self.requests.pop (reqId, None)
  369. if item is None:
  370. # we never recorded this request (blacklisted scheme, for example)
  371. return
  372. if not item.response:
  373. # chrome failed to send us a responseReceived event for this item,
  374. # so we can’t record it (missing request/response headers)
  375. self.logger.error ('response missing',
  376. uuid='fac3ab96-3f9b-4c5a-95c7-f83b675cdcb9', requestId=item.id)
  377. return
  378. req = item.request
  379. if item.url.scheme in self.allowedSchemes:
  380. item.fromLoadingFinished (kwargs)
  381. # XXX queue both
  382. await asyncio.gather (item.prefetchRequestBody (self.tab), item.prefetchResponseBody (self.tab))
  383. return item
  384. async def _loadingFailed (self, **kwargs):
  385. self.logger.info ('loadingFailed',
  386. uuid='4a944e85-5fae-4aa6-9e7c-e578b29392e4', args=kwargs)
  387. reqId = kwargs['requestId']
  388. logger = self.logger.bind (reqId=reqId)
  389. item = self.requests.pop (reqId, None)
  390. if item is not None:
  391. item.fromLoadingFailed (kwargs)
  392. return item
  393. async def _entryAdded (self, **kwargs):
  394. """ Log entry added """
  395. entry = kwargs['entry']
  396. level = {'verbose': Level.DEBUG, 'info': Level.INFO,
  397. 'warning': Level.WARNING,
  398. 'error': Level.ERROR}.get (entry.pop ('level'), Level.INFO)
  399. entry['uuid'] = 'e62ffb5a-0521-459c-a3d9-1124551934d2'
  400. self.logger (level, 'console', **entry)
  401. async def _javascriptDialogOpening (self, **kwargs):
  402. t = kwargs.get ('type')
  403. if t in {'alert', 'confirm', 'prompt'}:
  404. self.logger.info ('js dialog',
  405. uuid='d6f07ce2-648e-493b-a1df-f353bed27c84',
  406. action='cancel', type=t, message=kwargs.get ('message'))
  407. await self.tab.Page.handleJavaScriptDialog (accept=False)
  408. elif t == 'beforeunload':
  409. # we must accept this one, otherwise the page will not unload/close
  410. self.logger.info ('js dialog',
  411. uuid='96399b99-9834-4c8f-bd93-cb9fa2225abd',
  412. action='proceed', type=t, message=kwargs.get ('message'))
  413. await self.tab.Page.handleJavaScriptDialog (accept=True)
  414. else: # pragma: no cover
  415. self.logger.warning ('js dialog unknown',
  416. uuid='3ef7292e-8595-4e89-b834-0cc6bc40ee38', **kwargs)
  417. async def _frameStartedLoading (self, **kwargs):
  418. self.logger.debug ('frameStartedLoading',
  419. uuid='bbeb39c0-3304-4221-918e-f26bd443c566', args=kwargs)
  420. self._framesLoading.add (kwargs['frameId'])
  421. return PageIdle (False)
  422. async def _frameStoppedLoading (self, **kwargs):
  423. self.logger.debug ('frameStoppedLoading',
  424. uuid='fcbe8110-511c-4cbb-ac2b-f61a5782c5a0', args=kwargs)
  425. self._framesLoading.remove (kwargs['frameId'])
  426. if not self._framesLoading:
  427. return PageIdle (True)
  428. async def _frameNavigated (self, **kwargs):
  429. self.logger.debug ('frameNavigated',
  430. uuid='0e876f7d-7129-4612-8632-686f42ac6e1f', args=kwargs)
  431. frame = kwargs['frame']
  432. if self._rootFrame == frame['id']:
  433. assert frame.get ('parentId', None) is None, "root frame must not have a parent"
  434. return FrameNavigated (frame['id'], frame['url'], frame['mimeType'])