digestverifyingreader.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. import base64
  2. import sys
  3. from warcio.limitreader import LimitReader
  4. from warcio.utils import to_native_str, Digester
  5. from warcio.exceptions import ArchiveLoadFailed
  6. # ============================================================================
  7. class DigestChecker(object):
  8. def __init__(self, kind=None):
  9. self._problem = []
  10. self._passed = None
  11. self.kind = kind
  12. @property
  13. def passed(self):
  14. return self._passed
  15. @passed.setter
  16. def passed(self, value):
  17. self._passed = value
  18. @property
  19. def problems(self):
  20. return self._problem
  21. def problem(self, value, passed=False):
  22. self._problem.append(value)
  23. if self.kind == 'raise':
  24. raise ArchiveLoadFailed(value)
  25. if self.kind == 'log':
  26. sys.stderr.write(value + '\n')
  27. self._passed = passed
  28. # ============================================================================
  29. class DigestVerifyingReader(LimitReader):
  30. """
  31. A reader which verifies the digest of the wrapped reader
  32. """
  33. def __init__(self, stream, limit, digest_checker, record_type=None,
  34. payload_digest=None, block_digest=None, segment_number=None):
  35. super(DigestVerifyingReader, self).__init__(stream, limit)
  36. self.digest_checker = digest_checker
  37. if record_type == 'revisit':
  38. block_digest = None
  39. payload_digest = None
  40. if segment_number is not None: #pragma: no cover
  41. payload_digest = None
  42. self.payload_digest = payload_digest
  43. self.block_digest = block_digest
  44. self.payload_digester = None
  45. self.payload_digester_obj = None
  46. self.block_digester = None
  47. if block_digest:
  48. try:
  49. algo, _ = _parse_digest(block_digest)
  50. self.block_digester = Digester(algo)
  51. except ValueError:
  52. self.digest_checker.problem('unknown hash algorithm name in block digest')
  53. self.block_digester = None
  54. if payload_digest:
  55. try:
  56. algo, _ = _parse_digest(self.payload_digest)
  57. self.payload_digester_obj = Digester(algo)
  58. except ValueError:
  59. self.digest_checker.problem('unknown hash algorithm name in payload digest')
  60. self.payload_digester_obj = None
  61. def begin_payload(self):
  62. self.payload_digester = self.payload_digester_obj
  63. if self.limit == 0:
  64. check = _compare_digest_rfc_3548(self.payload_digester, self.payload_digest)
  65. if check is False:
  66. self.digest_checker.problem('payload digest failed: {}'.format(self.payload_digest))
  67. self.payload_digester = None # prevent double-fire
  68. elif check is True and self.digest_checker.passed is not False:
  69. self.digest_checker.passed = True
  70. def _update(self, buff):
  71. super(DigestVerifyingReader, self)._update(buff)
  72. if self.payload_digester:
  73. self.payload_digester.update(buff)
  74. if self.block_digester:
  75. self.block_digester.update(buff)
  76. if self.limit == 0:
  77. check = _compare_digest_rfc_3548(self.block_digester, self.block_digest)
  78. if check is False:
  79. self.digest_checker.problem('block digest failed: {}'.format(self.block_digest))
  80. elif check is True and self.digest_checker.passed is not False:
  81. self.digest_checker.passed = True
  82. check = _compare_digest_rfc_3548(self.payload_digester, self.payload_digest)
  83. if check is False:
  84. self.digest_checker.problem('payload digest failed {}'.format(self.payload_digest))
  85. elif check is True and self.digest_checker.passed is not False:
  86. self.digest_checker.passed = True
  87. return buff
  88. def _compare_digest_rfc_3548(digester, digest):
  89. '''
  90. The WARC standard does not recommend a digest algorithm and appears to
  91. allow any encoding from RFC3548. The Python base64 module supports
  92. RFC3548 although the base64 alternate alphabet is not exactly a first
  93. class citizen. Hopefully digest algos are named with the same names
  94. used by OpenSSL.
  95. '''
  96. if not digester or not digest:
  97. return None
  98. digester_b32 = str(digester)
  99. our_algo, our_value = _parse_digest(digester_b32)
  100. warc_algo, warc_value = _parse_digest(digest)
  101. warc_b32 = _to_b32(len(our_value), warc_value)
  102. if our_value == warc_b32:
  103. return True
  104. return False
  105. def _to_b32(length, value):
  106. '''
  107. Convert value to base 32, given that it's supposed to have the same
  108. length as the digest we're about to compare it to
  109. '''
  110. if len(value) == length:
  111. return value # casefold needed here? -- rfc recommends not allowing
  112. if len(value) > length:
  113. binary = base64.b16decode(value, casefold=True)
  114. else:
  115. binary = _b64_wrapper(value)
  116. return to_native_str(base64.b32encode(binary), encoding='ascii')
  117. base64_url_filename_safe_alt = b'-_'
  118. def _b64_wrapper(value):
  119. if '-' in value or '_' in value:
  120. return base64.b64decode(value, altchars=base64_url_filename_safe_alt)
  121. else:
  122. return base64.b64decode(value)
  123. def _parse_digest(digest):
  124. algo, sep, value = digest.partition(':')
  125. if sep == ':':
  126. return algo, value
  127. else:
  128. raise ValueError('could not parse digest algorithm out of '+digest)