extractor.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. from warcio.archiveiterator import ArchiveIterator
  2. from warcio.utils import BUFF_SIZE
  3. import sys
  4. # ============================================================================
  5. class Extractor(object):
  6. READ_SIZE = BUFF_SIZE * 4
  7. def __init__(self, filename, offset):
  8. self.filename = filename
  9. self.offset = offset
  10. def extract(self, payload_only, headers_only):
  11. with open(self.filename, 'rb') as fh:
  12. fh.seek(int(self.offset))
  13. it = iter(ArchiveIterator(fh))
  14. record = next(it)
  15. try:
  16. stdout_raw = sys.stdout.buffer
  17. except AttributeError: #pragma: no cover
  18. stdout_raw = sys.stdout
  19. if payload_only:
  20. stream = record.content_stream()
  21. buf = stream.read(self.READ_SIZE)
  22. while buf:
  23. stdout_raw.write(buf)
  24. buf = stream.read(self.READ_SIZE)
  25. else:
  26. stdout_raw.write(record.rec_headers.to_bytes())
  27. if record.http_headers:
  28. stdout_raw.write(record.http_headers.to_bytes())
  29. if not headers_only:
  30. buf = record.raw_stream.read(self.READ_SIZE)
  31. while buf:
  32. stdout_raw.write(buf)
  33. buf = record.raw_stream.read(self.READ_SIZE)