1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- from warcio.archiveiterator import ArchiveIterator
- from warcio.exceptions import ArchiveLoadFailed
- from warcio.warcwriter import WARCWriter
- from warcio.bufferedreaders import DecompressingBufferedReader
- import tempfile
- import shutil
- import traceback
- import sys
- # ============================================================================
class Recompressor(object):
    """Re-write a WARC or ARC file as a gzip-enabled WARC file.

    The input is first parsed record by record and each record is written
    to ``output`` through a ``WARCWriter`` created with ``gzip=True``.  If
    that pass raises, the whole input is decompressed into a temporary
    file and the same record-by-record pass is retried on the result.
    """

    def __init__(self, filename, output, verbose=False):
        """Store the job parameters; no I/O happens here.

        :param filename: path of the WARC/ARC file to read.
        :param output: path of the file to write (opened with ``'wb'``).
        :param verbose: when true, print parsing errors, an index of the
            written file and exception tracebacks.
        """
        self.filename = filename
        self.output = output
        self.verbose = verbose

    def recompress(self):
        """Run the recompression and print a summary to stdout.

        On success prints the record count and a status message.  If the
        input cannot be processed by either pass, prints a failure message
        and terminates the process via ``sys.exit(1)``.
        """
        # NOTE(review): imported locally, presumably to avoid a circular
        # import with warcio.cli -- confirm before moving to module level.
        from warcio.cli import main

        try:
            count = 0
            msg = ''
            with open(self.filename, 'rb') as stream:
                try:
                    count = self.load_and_write(stream, self.output)
                    msg = 'No Errors Found!'
                except Exception as e:
                    if self.verbose:
                        print('Parsing Error(s) Found:')
                        print(str(e) if isinstance(e, ArchiveLoadFailed) else repr(e))
                        # print('') to match the other blank-line prints.
                        print('')

                    # Second attempt: the output file is re-opened with
                    # 'wb', so partial output from the first pass is
                    # discarded.
                    count = self.decompress_and_recompress(stream, self.output)
                    msg = 'Compression Errors Found and Fixed!'

                if self.verbose:
                    print('Records successfully read and compressed:')
                    main(['index', self.output])
                    print('')

                print('{0} records read and recompressed to file: {1}'.format(count, self.output))
                print(msg)

        except Exception:
            # Deliberately not a bare ``except:`` -- KeyboardInterrupt and
            # SystemExit must propagate instead of being reported as an
            # unreadable archive.
            if self.verbose:
                print('Exception Details:')
                traceback.print_exc()
                print('')

            print('Recompress Failed: {0} could not be read as a WARC or ARC'.format(self.filename))
            sys.exit(1)

    def load_and_write(self, stream, output):
        """Copy every record iterated from ``stream`` into ``output``.

        :param stream: binary file-like object to iterate records from.
        :param output: path of the file to create/truncate and write to.
        :returns: number of records written.
        """
        count = 0
        with open(output, 'wb') as out:
            writer = WARCWriter(filebuf=out, gzip=True)

            for record in ArchiveIterator(stream,
                                          no_record_parse=False,
                                          arc2warc=True,
                                          verify_http=False):
                writer.write_record(record)
                count += 1

            return count

    def decompress_and_recompress(self, stream, output):
        """Decompress ``stream`` to a temp file, then re-run the write pass.

        :param stream: seekable binary file object; it is rewound to
            offset 0 before being read.
        :param output: path of the file to write.
        :returns: number of records written by ``load_and_write``.
        """
        with tempfile.TemporaryFile() as tout:
            decomp = DecompressingBufferedReader(stream, read_all_members=True)

            # decompress entire file to temp file
            stream.seek(0)
            shutil.copyfileobj(decomp, tout)

            # attempt to compress and write temp
            tout.seek(0)
            return self.load_and_write(tout, output)
|