checker.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. from __future__ import print_function
  2. from warcio.archiveiterator import ArchiveIterator
  3. from warcio.exceptions import ArchiveLoadFailed
  4. def _read_entire_stream(stream):
  5. while True:
  6. piece = stream.read(1024*1024)
  7. if len(piece) == 0:
  8. break
  9. class Checker(object):
  10. def __init__(self, cmd):
  11. self.inputs = cmd.inputs
  12. self.verbose = cmd.verbose
  13. self.exit_value = 0
  14. def process_all(self):
  15. for filename in self.inputs:
  16. try:
  17. self.process_one(filename)
  18. except ArchiveLoadFailed as e:
  19. print(filename)
  20. print(' saw exception ArchiveLoadFailed: '+str(e).rstrip())
  21. print(' skipping rest of file')
  22. self.exit_value = 1
  23. return self.exit_value
  24. def process_one(self, filename):
  25. printed_filename = False
  26. with open(filename, 'rb') as stream:
  27. it = ArchiveIterator(stream, check_digests=True)
  28. for record in it:
  29. digest_present = (record.rec_headers.get_header('WARC-Payload-Digest') or
  30. record.rec_headers.get_header('WARC-Block-Digest'))
  31. _read_entire_stream(record.content_stream())
  32. d_msg = None
  33. output = []
  34. rec_id = record.rec_headers.get_header('WARC-Record-ID')
  35. rec_type = record.rec_headers.get_header('WARC-Type')
  36. rec_offset = it.get_record_offset()
  37. if record.digest_checker.passed is False:
  38. self.exit_value = 1
  39. output = list(record.digest_checker.problems)
  40. elif record.digest_checker.passed is True and self.verbose:
  41. d_msg = 'digest pass'
  42. elif record.digest_checker.passed is None and self.verbose:
  43. if digest_present and rec_type == 'revisit':
  44. d_msg = 'digest present but not checked (revisit)'
  45. elif digest_present: # pragma: no cover
  46. # should not happen
  47. d_msg = 'digest present but not checked'
  48. else:
  49. d_msg = 'no digest to check'
  50. if d_msg or output:
  51. if not printed_filename:
  52. print(filename)
  53. printed_filename = True
  54. print(' ', 'offset', rec_offset, 'WARC-Record-ID', rec_id, rec_type)
  55. if d_msg:
  56. print(' ', d_msg)
  57. for o in output:
  58. print(' ', o)