# recompressor.py (2.7 KB)
  1. from warcio.archiveiterator import ArchiveIterator
  2. from warcio.exceptions import ArchiveLoadFailed
  3. from warcio.warcwriter import WARCWriter
  4. from warcio.bufferedreaders import DecompressingBufferedReader
  5. import tempfile
  6. import shutil
  7. import traceback
  8. import sys
  9. # ============================================================================
  10. class Recompressor(object):
  11. def __init__(self, filename, output, verbose=False):
  12. self.filename = filename
  13. self.output = output
  14. self.verbose = verbose
  15. def recompress(self):
  16. from warcio.cli import main
  17. try:
  18. count = 0
  19. msg = ''
  20. with open(self.filename, 'rb') as stream:
  21. try:
  22. count = self.load_and_write(stream, self.output)
  23. msg = 'No Errors Found!'
  24. except Exception as e:
  25. if self.verbose:
  26. print('Parsing Error(s) Found:')
  27. print(str(e) if isinstance(e, ArchiveLoadFailed) else repr(e))
  28. print()
  29. count = self.decompress_and_recompress(stream, self.output)
  30. msg = 'Compression Errors Found and Fixed!'
  31. if self.verbose:
  32. print('Records successfully read and compressed:')
  33. main(['index', self.output])
  34. print('')
  35. print('{0} records read and recompressed to file: {1}'.format(count, self.output))
  36. print(msg)
  37. except:
  38. if self.verbose:
  39. print('Exception Details:')
  40. traceback.print_exc()
  41. print('')
  42. print('Recompress Failed: {0} could not be read as a WARC or ARC'.format(self.filename))
  43. sys.exit(1)
  44. def load_and_write(self, stream, output):
  45. count = 0
  46. with open(output, 'wb') as out:
  47. writer = WARCWriter(filebuf=out, gzip=True)
  48. for record in ArchiveIterator(stream,
  49. no_record_parse=False,
  50. arc2warc=True,
  51. verify_http=False):
  52. writer.write_record(record)
  53. count += 1
  54. return count
  55. def decompress_and_recompress(self, stream, output):
  56. with tempfile.TemporaryFile() as tout:
  57. decomp = DecompressingBufferedReader(stream, read_all_members=True)
  58. # decompress entire file to temp file
  59. stream.seek(0)
  60. shutil.copyfileobj(decomp, tout)
  61. # attempt to compress and write temp
  62. tout.seek(0)
  63. return self.load_and_write(tout, output)