test_warc.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. # Copyright (c) 2018 crocoite contributors
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in
  11. # all copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. # THE SOFTWARE.
  20. from tempfile import NamedTemporaryFile
  21. import json, urllib
  22. from operator import itemgetter
  23. from warcio.archiveiterator import ArchiveIterator
  24. from yarl import URL
  25. from multidict import CIMultiDict
  26. from hypothesis import given, reproduce_failure
  27. import hypothesis.strategies as st
  28. import pytest
  29. from .warc import WarcHandler
  30. from .logger import Logger, WarcHandlerConsumer
  31. from .controller import ControllerStart
  32. from .behavior import Script, ScreenshotEvent, DomSnapshotEvent
  33. from .browser import RequestResponsePair, Base64Body, UnicodeBody
  34. from .test_browser import requestResponsePair, urls
  35. def test_log ():
  36. logger = Logger ()
  37. with NamedTemporaryFile() as fd:
  38. with WarcHandler (fd, logger) as handler:
  39. warclogger = WarcHandlerConsumer (handler)
  40. logger.connect (warclogger)
  41. golden = []
  42. assert handler.log.tell () == 0
  43. golden.append (logger.info (foo=1, bar='baz', encoding='äöü⇔ΓΨ'))
  44. assert handler.log.tell () != 0
  45. handler.maxLogSize = 0
  46. golden.append (logger.info (bar=1, baz='baz'))
  47. # should flush the log
  48. assert handler.log.tell () == 0
  49. fd.seek (0)
  50. for it in ArchiveIterator (fd):
  51. headers = it.rec_headers
  52. assert headers['warc-type'] == 'metadata'
  53. assert 'warc-target-uri' not in headers
  54. assert headers['x-crocoite-type'] == 'log'
  55. assert headers['content-type'] == f'application/json; charset={handler.logEncoding}'
  56. while True:
  57. l = it.raw_stream.readline ()
  58. if not l:
  59. break
  60. data = json.loads (l.strip ())
  61. assert data == golden.pop (0)
  62. def jsonObject ():
  63. """ JSON-encodable objects """
  64. return st.dictionaries (st.text (), st.one_of (st.integers (), st.text ()))
  65. def viewport ():
  66. return st.builds (lambda x, y: f'{x}x{y}', st.integers (), st.integers ())
  67. def event ():
  68. return st.one_of (
  69. st.builds (ControllerStart, jsonObject ()),
  70. st.builds (Script.fromStr, st.text (), st.one_of(st.none (), st.text ())),
  71. st.builds (ScreenshotEvent, urls (), st.integers (), st.binary ()),
  72. st.builds (DomSnapshotEvent, urls (), st.builds (lambda x: x.encode ('utf-8'), st.text ()), viewport()),
  73. requestResponsePair (),
  74. )
  75. @pytest.mark.asyncio
  76. @given (st.lists (event ()))
  77. async def test_push (golden):
  78. def checkWarcinfoId (headers):
  79. if lastWarcinfoRecordid is not None:
  80. assert headers['WARC-Warcinfo-ID'] == lastWarcinfoRecordid
  81. lastWarcinfoRecordid = None
  82. # null logger
  83. logger = Logger ()
  84. with open('/tmp/test.warc.gz', 'w+b') as fd:
  85. with WarcHandler (fd, logger) as handler:
  86. for g in golden:
  87. await handler.push (g)
  88. fd.seek (0)
  89. it = iter (ArchiveIterator (fd))
  90. for g in golden:
  91. if isinstance (g, ControllerStart):
  92. rec = next (it)
  93. headers = rec.rec_headers
  94. assert headers['warc-type'] == 'warcinfo'
  95. assert 'warc-target-uri' not in headers
  96. assert 'x-crocoite-type' not in headers
  97. data = json.load (rec.raw_stream)
  98. assert data == g.payload
  99. lastWarcinfoRecordid = headers['warc-record-id']
  100. assert lastWarcinfoRecordid
  101. elif isinstance (g, Script):
  102. rec = next (it)
  103. headers = rec.rec_headers
  104. assert headers['warc-type'] == 'resource'
  105. assert headers['content-type'] == 'application/javascript; charset=utf-8'
  106. assert headers['x-crocoite-type'] == 'script'
  107. checkWarcinfoId (headers)
  108. if g.path:
  109. assert URL (headers['warc-target-uri']) == URL ('file://' + g.abspath)
  110. else:
  111. assert 'warc-target-uri' not in headers
  112. data = rec.raw_stream.read ().decode ('utf-8')
  113. assert data == g.data
  114. elif isinstance (g, ScreenshotEvent):
  115. # XXX: check refers-to header
  116. rec = next (it)
  117. headers = rec.rec_headers
  118. assert headers['warc-type'] == 'conversion'
  119. assert headers['x-crocoite-type'] == 'screenshot'
  120. checkWarcinfoId (headers)
  121. assert URL (headers['warc-target-uri']) == g.url, (headers['warc-target-uri'], g.url)
  122. assert headers['warc-refers-to'] is None
  123. assert int (headers['X-Crocoite-Screenshot-Y-Offset']) == g.yoff
  124. assert rec.raw_stream.read () == g.data
  125. elif isinstance (g, DomSnapshotEvent):
  126. rec = next (it)
  127. headers = rec.rec_headers
  128. assert headers['warc-type'] == 'conversion'
  129. assert headers['x-crocoite-type'] == 'dom-snapshot'
  130. checkWarcinfoId (headers)
  131. assert URL (headers['warc-target-uri']) == g.url
  132. assert headers['warc-refers-to'] is None
  133. assert rec.raw_stream.read () == g.document
  134. elif isinstance (g, RequestResponsePair):
  135. rec = next (it)
  136. # request
  137. headers = rec.rec_headers
  138. assert headers['warc-type'] == 'request'
  139. assert 'x-crocoite-type' not in headers
  140. checkWarcinfoId (headers)
  141. assert URL (headers['warc-target-uri']) == g.url
  142. assert headers['x-chrome-request-id'] == g.id
  143. assert CIMultiDict (rec.http_headers.headers) == g.request.headers
  144. if g.request.hasPostData:
  145. if g.request.body is not None:
  146. assert rec.raw_stream.read () == g.request.body
  147. else:
  148. # body fetch failed
  149. assert headers['warc-truncated'] == 'unspecified'
  150. assert not rec.raw_stream.read ()
  151. else:
  152. assert not rec.raw_stream.read ()
  153. # response
  154. if g.response:
  155. rec = next (it)
  156. headers = rec.rec_headers
  157. httpheaders = rec.http_headers
  158. assert headers['warc-type'] == 'response'
  159. checkWarcinfoId (headers)
  160. assert URL (headers['warc-target-uri']) == g.url
  161. assert headers['x-chrome-request-id'] == g.id
  162. assert 'x-crocoite-type' not in headers
  163. # these are checked separately
  164. filteredHeaders = CIMultiDict (httpheaders.headers)
  165. for b in {'content-type', 'content-length'}:
  166. if b in g.response.headers:
  167. g.response.headers.popall (b)
  168. if b in filteredHeaders:
  169. filteredHeaders.popall (b)
  170. assert filteredHeaders == g.response.headers
  171. expectedContentType = g.response.mimeType
  172. if expectedContentType is not None:
  173. assert httpheaders['content-type'].startswith (expectedContentType)
  174. if g.response.body is not None:
  175. assert rec.raw_stream.read () == g.response.body
  176. assert httpheaders['content-length'] == str (len (g.response.body))
  177. # body is never truncated if it exists
  178. assert headers['warc-truncated'] is None
  179. # unencoded strings are converted to utf8
  180. if isinstance (g.response.body, UnicodeBody) and httpheaders['content-type'] is not None:
  181. assert httpheaders['content-type'].endswith ('; charset=utf-8')
  182. else:
  183. # body fetch failed
  184. assert headers['warc-truncated'] == 'unspecified'
  185. assert not rec.raw_stream.read ()
  186. # content-length header should be kept intact
  187. else:
  188. assert False, f"invalid golden type {type(g)}" # pragma: no cover
  189. # no further records
  190. with pytest.raises (StopIteration):
  191. next (it)