indexer.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. import json
  2. import sys
  3. import os
  4. from collections import OrderedDict
  5. from warcio.archiveiterator import ArchiveIterator
  6. from warcio.utils import open_or_default
# ============================================================================
  8. class Indexer(object):
  9. field_names = {}
  10. def __init__(self, fields, inputs, output, verify_http=False):
  11. if isinstance(fields, str):
  12. fields = fields.split(',')
  13. self.fields = fields
  14. self.record_parse = any(field.startswith('http:') for field in self.fields)
  15. self.inputs = inputs
  16. self.output = output
  17. self.verify_http = verify_http
  18. def process_all(self):
  19. with open_or_default(self.output, 'wt', sys.stdout) as out:
  20. for filename in self.inputs:
  21. try:
  22. stdin = sys.stdin.buffer
  23. except AttributeError: # py2
  24. stdin = sys.stdin
  25. with open_or_default(filename, 'rb', stdin) as fh:
  26. self.process_one(fh, out, filename)
  27. def process_one(self, input_, output, filename):
  28. it = self._create_record_iter(input_)
  29. self._write_header(output, filename)
  30. for record in it:
  31. self.process_index_entry(it, record, filename, output)
  32. def process_index_entry(self, it, record, filename, output):
  33. index = self._new_dict(record)
  34. for field in self.fields:
  35. value = self.get_field(record, field, it, filename)
  36. if value is not None:
  37. field = self.field_names.get(field, field)
  38. index[field] = value
  39. self._write_line(output, index, record, filename)
  40. def _create_record_iter(self, input_):
  41. return ArchiveIterator(input_,
  42. no_record_parse=not self.record_parse,
  43. arc2warc=True,
  44. verify_http=self.verify_http)
  45. def _new_dict(self, record):
  46. return OrderedDict()
  47. def get_field(self, record, name, it, filename):
  48. value = None
  49. if name == 'offset':
  50. value = str(it.get_record_offset())
  51. elif name == 'length':
  52. value = str(it.get_record_length())
  53. elif name == 'filename':
  54. value = os.path.basename(filename)
  55. elif name == 'http:status':
  56. if record.rec_type in ('response', 'revisit') and record.http_headers:
  57. value = record.http_headers.get_statuscode()
  58. elif name.startswith('http:'):
  59. if record.http_headers:
  60. value = record.http_headers.get_header(name[5:])
  61. else:
  62. value = record.rec_headers.get_header(name)
  63. return value
  64. def _write_header(self, out, filename):
  65. pass
  66. def _write_line(self, out, index, record, filename):
  67. out.write(json.dumps(index) + '\n')