123456789101112131415161718192021222324252627282930313233343536373839404142 |
- from warcio.archiveiterator import ArchiveIterator
- from warcio.utils import BUFF_SIZE
- import sys
- # ============================================================================
class Extractor(object):
    """Write a single archive record, read from a byte offset, to stdout.

    The record is the first one yielded by ``ArchiveIterator`` after seeking
    the file to ``offset``.
    """

    # Number of bytes requested per read() call while copying a stream.
    READ_SIZE = BUFF_SIZE * 4

    def __init__(self, filename, offset):
        """Remember which file to open and where the record starts.

        :param filename: path of the archive file to open.
        :param offset: byte position to seek to; anything ``int()`` accepts.
        """
        self.offset = offset
        self.filename = filename

    def extract(self, payload_only, headers_only):
        """Copy the record at ``self.offset`` to standard output as bytes.

        :param payload_only: when true, write only what the record's
            ``content_stream()`` yields and nothing else.
        :param headers_only: consulted only when ``payload_only`` is false;
            when true, stop after the headers have been written.
        """
        with open(self.filename, 'rb') as archive:
            archive.seek(int(self.offset))
            record = next(iter(ArchiveIterator(archive)))

            # Use the binary layer of stdout when it exists; otherwise fall
            # back to sys.stdout itself.
            out = getattr(sys.stdout, 'buffer', sys.stdout)

            if payload_only:
                self._pump(record.content_stream(), out)
                return

            out.write(record.rec_headers.to_bytes())

            http_headers = record.http_headers
            if http_headers:
                out.write(http_headers.to_bytes())

            if headers_only:
                return

            self._pump(record.raw_stream, out)

    def _pump(self, source, sink):
        """Copy ``source`` to ``sink`` in READ_SIZE chunks until a read is empty."""
        chunk_size = self.READ_SIZE
        while True:
            chunk = source.read(chunk_size)
            if not chunk:
                break
            sink.write(chunk)
|