# cli.py 3.7 KB

import sys
from argparse import ArgumentParser, RawTextHelpFormatter

from warcio.indexer import Indexer
from warcio.checker import Checker
from warcio.extractor import Extractor
from warcio.recompressor import Recompressor


  7. # ============================================================================
  8. def main(args=None):
  9. parser = ArgumentParser(description='warcio utils',
  10. formatter_class=RawTextHelpFormatter)
  11. parser.add_argument('-V', '--version', action='version', version=get_version())
  12. subparsers = parser.add_subparsers(dest='cmd')
  13. subparsers.required = True
  14. index = subparsers.add_parser('index', help='WARC/ARC Indexer')
  15. index.add_argument('inputs', nargs='*', help='input file(s); default is stdin')
  16. index.add_argument('-f', '--fields', default='offset,warc-type,warc-target-uri',
  17. help='fields to include in json output; supported values are "offset", '
  18. '"length", "filename", "http:status", "http:{http-header}" '
  19. '(arbitrary http header), and "{warc-header}" (arbitrary warc '
  20. 'record header)')
  21. index.add_argument('-o', '--output', help='output file; default is stdout')
  22. index.set_defaults(func=indexer)
  23. recompress = subparsers.add_parser('recompress', help='Recompress an existing WARC or ARC',
  24. description='Read an existing, possibly broken WARC ' +
  25. 'and correctly recompress it to fix any compression errors\n' +
  26. 'Also convert any ARC file to a standard compressed WARC file')
  27. recompress.add_argument('filename')
  28. recompress.add_argument('output')
  29. recompress.add_argument('-v', '--verbose', action='store_true')
  30. recompress.set_defaults(func=recompressor)
  31. extract = subparsers.add_parser('extract', help='Extract WARC/ARC Record')
  32. extract.add_argument('filename')
  33. extract.add_argument('offset')
  34. group = extract.add_mutually_exclusive_group()
  35. group.add_argument('--payload', action='store_true', help='output only record payload (after content and transfer decoding, if applicable)')
  36. group.add_argument('--headers', action='store_true', help='output only record headers (and http headers, if applicable)')
  37. extract.set_defaults(func=extractor)
  38. check = subparsers.add_parser('check', help='WARC digest checker')
  39. check.add_argument('inputs', nargs='+')
  40. check.add_argument('-v', '--verbose', action='store_true')
  41. check.set_defaults(func=checker)
  42. cmd = parser.parse_args(args=args)
  43. cmd.func(cmd)
  44. # ============================================================================
  45. def get_version():
  46. import pkg_resources
  47. return '%(prog)s ' + pkg_resources.get_distribution('warcio').version
  48. # ============================================================================
  49. def indexer(cmd):
  50. inputs = cmd.inputs or ('-',) # default to stdin
  51. _indexer = Indexer(cmd.fields, inputs, cmd.output)
  52. _indexer.process_all()
  53. # ============================================================================
  54. def checker(cmd):
  55. _checker = Checker(cmd)
  56. sys.exit(_checker.process_all())
  57. # ============================================================================
  58. def extractor(cmd):
  59. _extractor = Extractor(cmd.filename, cmd.offset)
  60. _extractor.extract(cmd.payload, cmd.headers)
  61. # ============================================================================
  62. def recompressor(cmd):
  63. _recompressor = Recompressor(cmd.filename, cmd.output, cmd.verbose)
  64. _recompressor.recompress()
  65. # ============================================================================
  66. if __name__ == "__main__": #pragma: no cover
  67. main()