12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
- from __future__ import print_function
- from warcio.archiveiterator import ArchiveIterator
- from warcio.exceptions import ArchiveLoadFailed
def _read_entire_stream(stream):
    """Consume *stream* to exhaustion, discarding everything that is read.

    Data is pulled in pieces of up to 1 MiB until a ``read`` call returns
    an empty result.  Nothing is returned.
    """
    chunk_size = 1024 * 1024
    # Keep reading until a read yields no data; the data itself is not needed.
    while len(stream.read(chunk_size)) != 0:
        pass
class Checker(object):
    """Check the digests of every record in a list of archive files.

    Results are reported with ``print``.  ``exit_value`` starts at 0 and is
    set to 1 as soon as a digest check fails or a file cannot be loaded.
    """

    def __init__(self, cmd):
        """Store the options carried by *cmd*.

        Reads ``cmd.inputs`` (the filenames to check) and ``cmd.verbose``
        (whether to also report records whose digest did not fail).
        """
        self.inputs = cmd.inputs
        self.verbose = cmd.verbose
        self.exit_value = 0

    def process_all(self):
        """Check each input file in turn and return ``exit_value``.

        An ``ArchiveLoadFailed`` raised while processing a file is reported,
        the rest of that file is skipped, and processing continues with the
        next input.
        """
        for path in self.inputs:
            try:
                self.process_one(path)
            except ArchiveLoadFailed as err:
                print(path)
                print(' saw exception ArchiveLoadFailed: ' + str(err).rstrip())
                print(' skipping rest of file')
                self.exit_value = 1
        return self.exit_value

    def process_one(self, filename):
        """Check every record of *filename* and print the findings.

        The filename is printed once, before the first record that has
        something to report.  Failed digests are always reported and set
        ``exit_value`` to 1; passes and unchecked records are only reported
        when ``verbose`` is set.
        """
        header_shown = False
        with open(filename, 'rb') as fh:
            records = ArchiveIterator(fh, check_digests=True)
            for rec in records:
                headers = rec.rec_headers
                has_digest = (headers.get_header('WARC-Payload-Digest') or
                              headers.get_header('WARC-Block-Digest'))

                # Drain the record before consulting its digest checker
                # (presumably the digests are verified as the content is
                # read -- confirm against warcio).
                _read_entire_stream(rec.content_stream())

                record_id = headers.get_header('WARC-Record-ID')
                record_type = headers.get_header('WARC-Type')
                offset = records.get_record_offset()

                verdict = rec.digest_checker.passed
                note = None
                problems = []
                if verdict is False:
                    self.exit_value = 1
                    problems = list(rec.digest_checker.problems)
                elif self.verbose:
                    if verdict is True:
                        note = 'digest pass'
                    elif verdict is None:
                        if not has_digest:
                            note = 'no digest to check'
                        elif record_type == 'revisit':
                            note = 'digest present but not checked (revisit)'
                        else:  # pragma: no cover
                            # should not happen
                            note = 'digest present but not checked'

                # Nothing to say about this record.
                if not (note or problems):
                    continue

                if not header_shown:
                    print(filename)
                    header_shown = True
                print(' ', 'offset', offset, 'WARC-Record-ID', record_id, record_type)
                if note:
                    print(' ', note)
                for problem in problems:
                    print(' ', problem)
|