LiuFan
/
PrivacyScanData


			
							123456789101112131415161718192021222324252627282930313233343536373839404142
							from warcio.archiveiterator import ArchiveIterator

from warcio.utils import BUFF_SIZE
import sys


# ============================================================================
class Extractor(object):
    """Dump one archive record, located by byte offset in a file, to stdout.

    The record is obtained by seeking to ``offset`` in ``filename`` and
    taking the first item produced by ``ArchiveIterator``.
    """

    # Number of bytes requested per read while copying record data.
    READ_SIZE = BUFF_SIZE * 4

    def __init__(self, filename, offset):
        """Store the location of the record to extract.

        :param filename: path of the archive file; ``extract`` opens it
            in binary mode.
        :param offset: position of the record within the file. It is
            passed through ``int()`` before seeking, so an int or a
            numeric string both work.
        """
        self.offset = offset
        self.filename = filename

    def extract(self, payload_only, headers_only):
        """Write the record at the stored offset to standard output.

        :param payload_only: when truthy, only the data read from
            ``record.content_stream()`` is written and ``headers_only``
            is ignored.
        :param headers_only: when truthy (and ``payload_only`` is
            falsy), output stops after the record headers and, if the
            record has them, the HTTP headers. When falsy, the data
            read from ``record.raw_stream`` follows the headers.
        :raises StopIteration: if the iterator yields no record at the
            given offset.
        """
        with open(self.filename, 'rb') as archive:
            archive.seek(int(self.offset))
            record = next(iter(ArchiveIterator(archive)))

            # Bytes are written, so use the binary ``buffer`` of stdout
            # when it exists; otherwise fall back to ``sys.stdout``
            # itself (presumably for interpreters whose stdout has no
            # ``buffer`` attribute, e.g. Python 2).
            sink = getattr(sys.stdout, 'buffer', sys.stdout)

            if payload_only:
                source = record.content_stream()
            else:
                sink.write(record.rec_headers.to_bytes())
                if record.http_headers:
                    sink.write(record.http_headers.to_bytes())
                if headers_only:
                    return
                source = record.raw_stream

            # Copy in READ_SIZE chunks until a read returns nothing.
            while True:
                chunk = source.read(self.READ_SIZE)
                if not chunk:
                    break
                sink.write(chunk)