test_writer.py 29 KB


  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. from warcio.statusandheaders import StatusAndHeaders
  4. from warcio.warcwriter import BufferWARCWriter, GzippingWrapper
  5. from warcio.recordbuilder import RecordBuilder
  6. from warcio.recordloader import ArcWarcRecordLoader
  7. from warcio.archiveiterator import ArchiveIterator
  8. from warcio.bufferedreaders import DecompressingBufferedReader
  9. from . import get_test_file
  10. from io import BytesIO
  11. from collections import OrderedDict
  12. import json
  13. import re
  14. import pytest
  15. # ============================================================================
  16. class FixedTestRecordMixin:
  17. @classmethod
  18. def _make_warc_id(cls, id_=None):
  19. return '<urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>'
  20. @classmethod
  21. def _make_warc_date(cls, use_micros=False):
  22. if not use_micros:
  23. return '2000-01-01T00:00:00Z'
  24. else:
  25. return '2000-01-01T00:00:00.123456Z'
  26. class FixedTestRecordBuilder(FixedTestRecordMixin, RecordBuilder):
  27. pass
  28. class FixedTestWARCWriter(FixedTestRecordMixin, BufferWARCWriter):
  29. pass
  30. # ============================================================================
# Expected serialized form of every sample record.
#
# Each constant is registered together with a maker function through
# @sample_record, and the tests compare the decoded writer output against the
# constant for exact equality. Header order, CRLF line endings, digests and
# Content-Length values are therefore all significant. Every record ends with
# two CRLFs after its content block.

# Expected output for sample_warcinfo.
WARCINFO_RECORD = '\
WARC/1.0\r\n\
WARC-Type: warcinfo\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Filename: testfile.warc.gz\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Block-Digest: sha1:GAD6P5BTZPRU57ICXEYUJZGCURZYABID\r\n\
Content-Type: application/warc-fields\r\n\
Content-Length: 86\r\n\
\r\n\
software: recorder test\r\n\
format: WARC File Format 1.0\r\n\
json-metadata: {"foo": "bar"}\r\n\
\r\n\
\r\n\
'
# Expected output for sample_response and sample_response_from_buff.
RESPONSE_RECORD = '\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:OS3OKGCWQIJOAOC3PKXQOQFD52NECQ74\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 97\r\n\
\r\n\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Custom-Header: somevalue\r\n\
\r\n\
some\n\
text\r\n\
\r\n\
'
# Expected output for sample_response_unicode; also the expected result of
# rewriting the record in test_utf8_rewrite_content_adjust. The non-ASCII
# header values appear here in percent-encoded form.
RESPONSE_RECORD_UNICODE_HEADERS = '\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:KMUABC6URWIQ7QXCZDQ5FS6WIBBFRORR\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 268\r\n\
\r\n\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Disposition: attachment; filename*=UTF-8\'\'%D0%B8%D1%81%D0%BF%D1%8B%D1%82%D0%B0%D0%BD%D0%B8%D0%B5.txt\r\n\
Custom-Header: somevalue\r\n\
Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n\
\r\n\
some\n\
text\r\n\
\r\n\
'
# Expected output for sample_response_2 (HTTP Content-Length and an unknown
# Content-Encoding header present).
RESPONSE_RECORD_2 = '\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:U6KNJY5MVNU3IMKED7FSO2JKW6MZ3QUX\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 145\r\n\
\r\n\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Content-Length: 9\r\n\
Custom-Header: somevalue\r\n\
Content-Encoding: x-unknown\r\n\
\r\n\
some\n\
text\r\n\
\r\n\
'
# Expected output for sample_request (GET with no body).
REQUEST_RECORD = '\
WARC/1.0\r\n\
WARC-Type: request\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\r\n\
WARC-Block-Digest: sha1:ONEHF6PTXPTTHE3333XHTD2X45TZ3DTO\r\n\
Content-Type: application/http; msgtype=request\r\n\
Content-Length: 54\r\n\
\r\n\
GET / HTTP/1.0\r\n\
User-Agent: foo\r\n\
Host: example.com\r\n\
\r\n\
\r\n\
\r\n\
'
# Expected output for sample_request_from_buff (POST with a JSON body).
REQUEST_RECORD_2 = '\
WARC/1.0\r\n\
WARC-Type: request\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:R5VZAKIE53UW5VGK43QJIFYS333QM5ZA\r\n\
WARC-Block-Digest: sha1:L7SVBUPPQ6RH3ANJD42G5JL7RHRVZ5DV\r\n\
Content-Type: application/http; msgtype=request\r\n\
Content-Length: 92\r\n\
\r\n\
POST /path HTTP/1.0\r\n\
Content-Type: application/json\r\n\
Content-Length: 17\r\n\
\r\n\
{"some": "value"}\r\n\
\r\n\
'
# Expected output for sample_revisit_1 (revisit with an empty block).
REVISIT_RECORD_1 = '\
WARC/1.0\r\n\
WARC-Type: revisit\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest\r\n\
WARC-Refers-To-Target-URI: http://example.com/foo\r\n\
WARC-Refers-To-Date: 1999-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 0\r\n\
\r\n\
\r\n\
\r\n\
'
# Expected output for sample_revisit_2 (revisit that carries HTTP headers).
REVISIT_RECORD_2 = '\
WARC/1.0\r\n\
WARC-Type: revisit\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Profile: http://netpreserve.org/warc/1.0/revisit/identical-payload-digest\r\n\
WARC-Refers-To-Target-URI: http://example.com/foo\r\n\
WARC-Refers-To-Date: 1999-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:A6J5UTI2QHHCZFCFNHQHCDD3JJFKP53V\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 88\r\n\
\r\n\
HTTP/1.0 200 OK\r\n\
Content-Type: text/plain; charset="UTF-8"\r\n\
Custom-Header: somevalue\r\n\
\r\n\
\r\n\
\r\n\
'
# Expected output for sample_revisit_1_1: WARC/1.1 protocol line, a WARC-Date
# with microseconds and the 1.1 revisit profile URI.
REVISIT_RECORD_3 = '\
WARC/1.1\r\n\
WARC-Type: revisit\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00.123456Z\r\n\
WARC-Profile: http://netpreserve.org/warc/1.1/revisit/identical-payload-digest\r\n\
WARC-Refers-To-Target-URI: http://example.com/foo\r\n\
WARC-Refers-To-Date: 1999-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 0\r\n\
\r\n\
\r\n\
\r\n\
'
# Expected output for sample_resource.
RESOURCE_RECORD = '\
WARC/1.0\r\n\
WARC-Type: resource\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: ftp://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
Content-Type: text/plain\r\n\
Content-Length: 9\r\n\
\r\n\
some\n\
text\r\n\
\r\n\
'
# Expected output for sample_resource_no_content_type (no Content-Type header).
RESOURCE_RECORD_NO_CONTENT_TYPE = '\
WARC/1.0\r\n\
WARC-Type: resource\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: ftp://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
WARC-Block-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
Content-Length: 9\r\n\
\r\n\
some\n\
text\r\n\
\r\n\
'
# Expected output for sample_metadata (JSON payload).
METADATA_RECORD = '\
WARC/1.0\r\n\
WARC-Type: metadata\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: http://example.com/\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:ZOLBLKAQVZE5DXH56XE6EH6AI6ZUGDPT\r\n\
WARC-Block-Digest: sha1:ZOLBLKAQVZE5DXH56XE6EH6AI6ZUGDPT\r\n\
Content-Type: application/json\r\n\
Content-Length: 67\r\n\
\r\n\
{"metadata": {"nested": "obj", "list": [1, 2, 3], "length": "123"}}\r\n\
\r\n\
'
# Expected output for sample_response_dns.
# NOTE(review): the payload below is 126 bytes as written, but the header
# declares Content-Length: 147. Runs of whitespace inside the DNS lines were
# presumably collapsed in this copy of the file -- confirm against upstream
# before relying on this constant.
DNS_RESPONSE_RECORD = '\
WARC/1.0\r\n\
WARC-Type: response\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: dns:google.com\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:2AAVJYKKIWK5CF6EWE7PH63EMNLO44TH\r\n\
WARC-Block-Digest: sha1:2AAVJYKKIWK5CF6EWE7PH63EMNLO44TH\r\n\
Content-Type: application/http; msgtype=response\r\n\
Content-Length: 147\r\n\
\r\n\
20170509000739\n\
google.com. 185 IN A 209.148.113.239\n\
google.com. 185 IN A 209.148.113.238\n\
google.com. 185 IN A 209.148.113.250\n\
\r\n\r\n\
'
# Expected output for sample_resource_dns.
# NOTE(review): same Content-Length (147) vs. visible payload size (126 bytes)
# mismatch as DNS_RESPONSE_RECORD -- confirm the original whitespace.
DNS_RESOURCE_RECORD = '\
WARC/1.0\r\n\
WARC-Type: resource\r\n\
WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
WARC-Target-URI: dns:google.com\r\n\
WARC-Date: 2000-01-01T00:00:00Z\r\n\
WARC-Payload-Digest: sha1:2AAVJYKKIWK5CF6EWE7PH63EMNLO44TH\r\n\
WARC-Block-Digest: sha1:2AAVJYKKIWK5CF6EWE7PH63EMNLO44TH\r\n\
Content-Type: application/warc-record\r\n\
Content-Length: 147\r\n\
\r\n\
20170509000739\n\
google.com. 185 IN A 209.148.113.239\n\
google.com. 185 IN A 209.148.113.238\n\
google.com. 185 IN A 209.148.113.250\n\
\r\n\r\n\
'
  276. # ============================================================================
  277. # Decorator Setup
  278. # ============================================================================
  279. all_sample_records = {}
  280. def sample_record(name, record_string):
  281. def decorate(f):
  282. all_sample_records[name] = (f, record_string)
  283. return f
  284. return decorate
  285. # ============================================================================
  286. # Sample Record Functions
  287. # ============================================================================
  288. @sample_record('warcinfo', WARCINFO_RECORD)
  289. def sample_warcinfo(builder):
  290. params = OrderedDict([('software', 'recorder test'),
  291. ('format', 'WARC File Format 1.0'),
  292. ('invalid', ''),
  293. ('json-metadata', json.dumps({'foo': 'bar'}))])
  294. return builder.create_warcinfo_record('testfile.warc.gz', params)
  295. # ============================================================================
  296. @sample_record('response_1', RESPONSE_RECORD)
  297. def sample_response(builder):
  298. headers_list = [('Content-Type', 'text/plain; charset="UTF-8"'),
  299. ('Custom-Header', 'somevalue')
  300. ]
  301. payload = b'some\ntext'
  302. http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')
  303. return builder.create_warc_record('http://example.com/', 'response',
  304. payload=BytesIO(payload),
  305. length=len(payload),
  306. http_headers=http_headers)
  307. # ============================================================================
  308. @sample_record('response_1-buff', RESPONSE_RECORD)
  309. def sample_response_from_buff(builder):
  310. payload = '\
  311. HTTP/1.0 200 OK\r\n\
  312. Content-Type: text/plain; charset="UTF-8"\r\n\
  313. Custom-Header: somevalue\r\n\
  314. \r\n\
  315. some\ntext'.encode('utf-8')
  316. return builder.create_warc_record('http://example.com/', 'response',
  317. payload=BytesIO(payload),
  318. length=len(payload))
  319. # ============================================================================
  320. @sample_record('response-unicode-header', RESPONSE_RECORD_UNICODE_HEADERS)
  321. def sample_response_unicode(builder):
  322. headers_list = [('Content-Type', 'text/plain; charset="UTF-8"'),
  323. ('Content-Disposition', u'attachment; filename="испытание.txt"'),
  324. ('Custom-Header', 'somevalue'),
  325. ('Unicode-Header', '📁 text 🗄️'),
  326. ]
  327. payload = b'some\ntext'
  328. http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')
  329. return builder.create_warc_record('http://example.com/', 'response',
  330. payload=BytesIO(payload),
  331. length=len(payload),
  332. http_headers=http_headers)
  333. # ============================================================================
  334. @sample_record('response_2', RESPONSE_RECORD_2)
  335. def sample_response_2(builder):
  336. payload = b'some\ntext'
  337. headers_list = [('Content-Type', 'text/plain; charset="UTF-8"'),
  338. ('Content-Length', str(len(payload))),
  339. ('Custom-Header', 'somevalue'),
  340. ('Content-Encoding', 'x-unknown'),
  341. ]
  342. http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')
  343. return builder.create_warc_record('http://example.com/', 'response',
  344. payload=BytesIO(payload),
  345. length=len(payload),
  346. http_headers=http_headers)
  347. # ============================================================================
  348. @sample_record('response_dns', DNS_RESPONSE_RECORD)
  349. def sample_response_dns(builder):
  350. payload = b'''\
  351. 20170509000739
  352. google.com. 185 IN A 209.148.113.239
  353. google.com. 185 IN A 209.148.113.238
  354. google.com. 185 IN A 209.148.113.250
  355. '''
  356. return builder.create_warc_record('dns:google.com', 'response',
  357. payload=BytesIO(payload))
  358. # ============================================================================
  359. @sample_record('resource_dns', DNS_RESOURCE_RECORD)
  360. def sample_resource_dns(builder):
  361. payload = b'''\
  362. 20170509000739
  363. google.com. 185 IN A 209.148.113.239
  364. google.com. 185 IN A 209.148.113.238
  365. google.com. 185 IN A 209.148.113.250
  366. '''
  367. return builder.create_warc_record('dns:google.com', 'resource',
  368. payload=BytesIO(payload))
  369. # ============================================================================
  370. @sample_record('request_1', REQUEST_RECORD)
  371. def sample_request(builder):
  372. headers_list = [('User-Agent', 'foo'),
  373. ('Host', 'example.com')]
  374. http_headers = StatusAndHeaders('GET / HTTP/1.0', headers_list, is_http_request=True)
  375. return builder.create_warc_record('http://example.com/', 'request',
  376. http_headers=http_headers)
  377. # ============================================================================
  378. @sample_record('request_2', REQUEST_RECORD_2)
  379. def sample_request_from_buff(builder):
  380. payload = '\
  381. POST /path HTTP/1.0\r\n\
  382. Content-Type: application/json\r\n\
  383. Content-Length: 17\r\n\
  384. \r\n\
  385. {"some": "value"}'.encode('utf-8')
  386. return builder.create_warc_record('http://example.com/', 'request',
  387. payload=BytesIO(payload),
  388. length=len(payload))
  389. # ============================================================================
  390. @sample_record('resource', RESOURCE_RECORD)
  391. def sample_resource(builder):
  392. payload = b'some\ntext'
  393. return builder.create_warc_record('ftp://example.com/', 'resource',
  394. payload=BytesIO(payload),
  395. length=len(payload),
  396. warc_content_type='text/plain')
  397. # ============================================================================
  398. @sample_record('resource_no_ct', RESOURCE_RECORD_NO_CONTENT_TYPE)
  399. def sample_resource_no_content_type(builder):
  400. payload = b'some\ntext'
  401. rec = builder.create_warc_record('ftp://example.com/', 'resource',
  402. payload=BytesIO(payload),
  403. length=len(payload))
  404. # default content-type added, but removing to match expected string
  405. assert rec.content_type == 'application/warc-record'
  406. rec.content_type = None
  407. return rec
  408. # ============================================================================
  409. @sample_record('metadata', METADATA_RECORD)
  410. def sample_metadata(builder):
  411. payload_dict = {"metadata": OrderedDict([("nested", "obj"),
  412. ("list", [1, 2, 3]),
  413. ("length", "123")])}
  414. payload = json.dumps(payload_dict).encode('utf-8')
  415. return builder.create_warc_record('http://example.com/', 'metadata',
  416. payload=BytesIO(payload),
  417. length=len(payload),
  418. warc_content_type='application/json')
  419. # ============================================================================
  420. @sample_record('revisit_1', REVISIT_RECORD_1)
  421. def sample_revisit_1(builder):
  422. return builder.create_revisit_record('http://example.com/',
  423. digest='sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O',
  424. refers_to_uri='http://example.com/foo',
  425. refers_to_date='1999-01-01T00:00:00Z')
  426. # ============================================================================
  427. @sample_record('revisit_2', REVISIT_RECORD_2)
  428. def sample_revisit_2(builder):
  429. resp = sample_response(builder)
  430. return builder.create_revisit_record('http://example.com/',
  431. digest='sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O',
  432. refers_to_uri='http://example.com/foo',
  433. refers_to_date='1999-01-01T00:00:00Z',
  434. http_headers=resp.http_headers)
  435. # ============================================================================
  436. @sample_record('revisit_warc_1_1', REVISIT_RECORD_3)
  437. def sample_revisit_1_1(builder):
  438. builder.warc_version = 'WARC/1.1'
  439. res = builder.create_revisit_record('http://example.com/',
  440. digest='sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O',
  441. refers_to_uri='http://example.com/foo',
  442. refers_to_date='1999-01-01T00:00:00Z')
  443. builder.warc_version = 'WARC/1.0'
  444. return res
  445. # ============================================================================
  446. # Fixture Setup
  447. # ============================================================================
  448. @pytest.fixture(params=['gzip', 'plain'])
  449. def is_gzip(request):
  450. return request.param == 'gzip'
  451. @pytest.fixture(params=['writer', 'builder'])
  452. def builder_factory(request):
  453. def factory(writer, builder_cls=FixedTestRecordBuilder, **kwargs):
  454. if request.param == 'writer':
  455. return writer
  456. return builder_cls(**kwargs)
  457. return factory
  458. @pytest.fixture(params=all_sample_records.keys())
  459. def record_sampler(request):
  460. return all_sample_records[request.param]
  461. # ============================================================================
  462. class TestWarcWriter(object):
  463. @classmethod
  464. def _validate_record_content_len(cls, stream):
  465. for record in ArchiveIterator(stream, no_record_parse=True):
  466. assert record.http_headers == None
  467. assert int(record.rec_headers.get_header('Content-Length')) == record.length
  468. assert record.length == len(record.raw_stream.read())
  469. def test_generate_record(self, record_sampler, is_gzip, builder_factory):
  470. writer = FixedTestWARCWriter(gzip=is_gzip)
  471. builder = builder_factory(writer)
  472. record_maker, record_string = record_sampler
  473. record = record_maker(builder)
  474. writer.write_record(record)
  475. raw_buff = writer.get_contents()
  476. self._validate_record_content_len(BytesIO(raw_buff))
  477. stream = DecompressingBufferedReader(writer.get_stream())
  478. buff = stream.read()
  479. if is_gzip:
  480. assert len(buff) > len(raw_buff)
  481. else:
  482. assert len(buff) == len(raw_buff)
  483. assert buff.decode('utf-8') == record_string
  484. # assert parsing record matches as well
  485. stream = DecompressingBufferedReader(writer.get_stream())
  486. parsed_record = ArcWarcRecordLoader().parse_record_stream(stream)
  487. writer2 = FixedTestWARCWriter(gzip=False)
  488. writer2.write_record(parsed_record)
  489. assert writer2.get_contents().decode('utf-8') == record_string
  490. # verify parts of record
  491. stream = DecompressingBufferedReader(writer.get_stream())
  492. parsed_record = ArcWarcRecordLoader().parse_record_stream(stream)
  493. content_buff = parsed_record.content_stream().read().decode('utf-8')
  494. assert content_buff in record_string
  495. rec_type = parsed_record.rec_type
  496. # verify http_headers
  497. # match original
  498. assert record.http_headers == parsed_record.http_headers
  499. if parsed_record.http_headers:
  500. assert rec_type in ('response', 'request', 'revisit')
  501. else:
  502. # empty revisit
  503. if rec_type == 'revisit':
  504. assert len(content_buff) == 0
  505. else:
  506. assert len(content_buff) == parsed_record.length
  507. def test_warcinfo_record(self, is_gzip, builder_factory):
  508. writer = FixedTestWARCWriter(gzip=is_gzip)
  509. builder = builder_factory(writer)
  510. record = sample_warcinfo(builder)
  511. writer.write_record(record)
  512. reader = DecompressingBufferedReader(writer.get_stream())
  513. parsed_record = ArcWarcRecordLoader().parse_record_stream(reader)
  514. assert parsed_record.rec_headers.get_header('WARC-Type') == 'warcinfo'
  515. assert parsed_record.rec_headers.get_header('Content-Type') == 'application/warc-fields'
  516. assert parsed_record.rec_headers.get_header('WARC-Filename') == 'testfile.warc.gz'
  517. assert parsed_record.rec_headers.get_header('WARC-Block-Digest') == 'sha1:GAD6P5BTZPRU57ICXEYUJZGCURZYABID'
  518. buff = parsed_record.content_stream().read().decode('utf-8')
  519. assert 'json-metadata: {"foo": "bar"}\r\n' in buff
  520. assert 'format: WARC File Format 1.0\r\n' in buff
  521. def test_request_response_concur(self, is_gzip, builder_factory):
  522. writer = BufferWARCWriter(gzip=is_gzip)
  523. builder = builder_factory(writer, builder_cls=RecordBuilder)
  524. resp = sample_response(builder)
  525. req = sample_request(builder)
  526. # test explicitly calling ensure_digest with block digest enabled on a record
  527. writer.ensure_digest(resp, block=True, payload=True)
  528. writer.write_request_response_pair(req, resp)
  529. stream = writer.get_stream()
  530. reader = ArchiveIterator(stream)
  531. resp, req = list(reader)
  532. resp_id = resp.rec_headers.get_header('WARC-Record-ID')
  533. req_id = req.rec_headers.get_header('WARC-Record-ID')
  534. assert resp_id != req_id
  535. assert resp_id == req.rec_headers.get_header('WARC-Concurrent-To')
  536. def test_response_warc_1_1(self, is_gzip, builder_factory):
  537. writer = BufferWARCWriter(gzip=is_gzip, warc_version='WARC/1.1')
  538. builder = builder_factory(writer, warc_version='WARC/1.1')
  539. resp = sample_response(builder)
  540. writer.write_record(resp)
  541. stream = writer.get_stream()
  542. reader = ArchiveIterator(stream)
  543. recs = list(reader)
  544. assert len(recs) == 1
  545. assert recs[0].rec_headers.protocol == 'WARC/1.1'
  546. # ISO 8601 date with fractional seconds (microseconds)
  547. assert '.' in recs[0].rec_headers['WARC-Date']
  548. assert len(recs[0].rec_headers['WARC-Date']) == 27
  549. def _conv_to_streaming_record(self, record_buff, rec_type):
  550. # strip-off the two empty \r\n\r\n added at the end of uncompressed record
  551. record_buff = record_buff[:-4]
  552. record_buff = re.sub('Content-Length:[^\r\n]+\r\n', '', record_buff, 1)
  553. # don't remove payload digest for revisit, as it can not be recomputed
  554. if rec_type != 'revisit':
  555. record_buff = re.sub('WARC-Payload-Digest:[^\r\n]+\r\n', '', record_buff, 1)
  556. assert 'WARC-Payload-Digest: ' not in record_buff
  557. record_buff = re.sub('WARC-Block-Digest:[^\r\n]+\r\n', 'WARC-Block-Digest: sha1:x-invalid\r\n', record_buff, 1)
  558. assert 'WARC-Block-Digest: sha1:x-invalid' in record_buff
  559. return record_buff
  560. def test_read_from_stream_no_content_length(self, record_sampler, is_gzip, builder_factory):
  561. writer = FixedTestWARCWriter(gzip=is_gzip)
  562. builder = builder_factory(writer)
  563. record_maker, record_string = record_sampler
  564. full_record = record_maker(builder)
  565. stream = BytesIO()
  566. record_no_cl = self._conv_to_streaming_record(record_string, full_record.rec_type)
  567. if is_gzip:
  568. gzip_stream = GzippingWrapper(stream)
  569. gzip_stream.write(record_no_cl.encode('utf-8'))
  570. gzip_stream.flush()
  571. else:
  572. stream.write(record_no_cl.encode('utf-8'))
  573. # parse to verify http headers + payload matches sample record
  574. # but not rec headers (missing content-length)
  575. stream.seek(0)
  576. parsed_record = ArcWarcRecordLoader().parse_record_stream(DecompressingBufferedReader(stream))
  577. if 'Content-Disposition' not in record_string:
  578. assert full_record.http_headers == parsed_record.http_headers
  579. assert full_record.raw_stream.read() == parsed_record.raw_stream.read()
  580. assert full_record.rec_headers != parsed_record.rec_headers
  581. # parse and write
  582. stream.seek(0)
  583. parsed_record = ArcWarcRecordLoader().parse_record_stream(DecompressingBufferedReader(stream))
  584. writer.write_record(parsed_record)
  585. stream = DecompressingBufferedReader(writer.get_stream())
  586. buff = stream.read()
  587. # assert written record matches expected response record
  588. # with content-length, digests computed
  589. assert buff.decode('utf-8') == record_string
  590. @pytest.mark.parametrize('filename', ['example.arc.gz', 'example.arc'])
  591. def test_arc2warc(self, filename, is_gzip):
  592. writer = FixedTestWARCWriter(gzip=is_gzip)
  593. def validate_warcinfo(record):
  594. assert record.rec_headers.get('WARC-Type') == 'warcinfo'
  595. assert record.rec_headers.get('WARC-Filename') == 'live-web-example.arc.gz'
  596. assert record.rec_headers.get('Content-Type') == 'text/plain'
  597. def validate_response(record):
  598. assert record.rec_headers.get('WARC-Type') == 'response'
  599. assert record.rec_headers.get('Content-Length') == '1591'
  600. assert record.length == 1591
  601. assert record.rec_headers.get('WARC-Target-URI') == 'http://example.com/'
  602. assert record.rec_headers.get('WARC-Date') == '2014-02-16T05:02:21Z'
  603. assert record.rec_headers.get('WARC-Block-Digest') == 'sha1:PEWDX5GTH66WU74WBPGFECIYBMPMP3FP'
  604. assert record.rec_headers.get('WARC-Payload-Digest') == 'sha1:B2LTWWPUOYAH7UIPQ7ZUPQ4VMBSVC36A'
  605. with open(get_test_file(filename), 'rb') as fh:
  606. for record in ArchiveIterator(fh, arc2warc=True):
  607. writer.write_record(record)
  608. if record.rec_type == 'response':
  609. validate_response(record)
  610. if record.rec_type == 'warcinfo':
  611. validate_warcinfo(record)
  612. raw_buff = writer.get_contents()
  613. self._validate_record_content_len(BytesIO(raw_buff))
  614. stream = writer.get_stream()
  615. records = list(ArchiveIterator(stream, arc2warc=False))
  616. assert len(records) == 2
  617. validate_warcinfo(records[0])
  618. validate_response(records[1])
  619. validate_warcinfo(records[0])
  620. def test_utf8_rewrite_content_adjust(self):
  621. UTF8_PAYLOAD = u'\
  622. HTTP/1.0 200 OK\r\n\
  623. Content-Type: text/plain; charset="UTF-8"\r\n\
  624. Content-Disposition: attachment; filename="испытание.txt"\r\n\
  625. Custom-Header: somevalue\r\n\
  626. Unicode-Header: %F0%9F%93%81%20text%20%F0%9F%97%84%EF%B8%8F\r\n\
  627. \r\n\
  628. some\n\
  629. text'
  630. content_length = len(UTF8_PAYLOAD.encode('utf-8'))
  631. UTF8_RECORD = u'\
  632. WARC/1.0\r\n\
  633. WARC-Type: response\r\n\
  634. WARC-Record-ID: <urn:uuid:12345678-feb0-11e6-8f83-68a86d1772ce>\r\n\
  635. WARC-Target-URI: http://example.com/\r\n\
  636. WARC-Date: 2000-01-01T00:00:00Z\r\n\
  637. WARC-Payload-Digest: sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O\r\n\
  638. WARC-Block-Digest: sha1:KMUABC6URWIQ7QXCZDQ5FS6WIBBFRORR\r\n\
  639. Content-Type: application/http; msgtype=response\r\n\
  640. Content-Length: {0}\r\n\
  641. \r\n\
  642. {1}\r\n\
  643. \r\n\
  644. '.format(content_length, UTF8_PAYLOAD)
  645. assert(content_length == 226)
  646. record = ArcWarcRecordLoader().parse_record_stream(BytesIO(UTF8_RECORD.encode('utf-8')))
  647. writer = BufferWARCWriter(gzip=False)
  648. writer.write_record(record)
  649. raw_buff = writer.get_contents()
  650. assert raw_buff.decode('utf-8') == RESPONSE_RECORD_UNICODE_HEADERS
  651. for record in ArchiveIterator(writer.get_stream()):
  652. assert record.length == 268
  653. def test_identity(self):
  654. """ read(write(record)) should yield record """
  655. payload = b'foobar'
  656. writer = BufferWARCWriter(gzip=True)
  657. httpHeaders = StatusAndHeaders('GET / HTTP/1.1', {}, is_http_request=True)
  658. warcHeaders = {'Foo': 'Bar'}
  659. record = writer.create_warc_record('http://example.com/', 'request',
  660. payload=BytesIO(payload),
  661. warc_headers_dict=warcHeaders, http_headers=httpHeaders)
  662. writer.write_record(record)
  663. for new_rec in ArchiveIterator(writer.get_stream()):
  664. assert new_rec.rec_type == record.rec_type
  665. assert new_rec.rec_headers == record.rec_headers
  666. assert new_rec.content_type == record.content_type
  667. assert new_rec.length == record.length
  668. assert new_rec.http_headers == record.http_headers
  669. assert new_rec.raw_stream.read() == payload