1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192 |
- from argparse import ArgumentParser, RawTextHelpFormatter
- from warcio.indexer import Indexer
- from warcio.checker import Checker
- from warcio.extractor import Extractor
- from warcio.recompressor import Recompressor
- import sys
- # ============================================================================
def main(args=None):
    """Parse command-line arguments and dispatch to the chosen subcommand.

    Builds the ``warcio utils`` parser with four subcommands (``index``,
    ``recompress``, ``extract`` and ``check``). Each subcommand registers
    its handler function as the ``func`` default, and the parsed namespace
    is handed to that handler.

    :param args: argument list forwarded to ``parse_args``. ``None`` (the
        default) is passed through unchanged, which makes argparse read
        the process command line.
    """
    root = ArgumentParser(description='warcio utils',
                          formatter_class=RawTextHelpFormatter)
    root.add_argument('-V', '--version', action='version',
                      version=get_version())

    commands = root.add_subparsers(dest='cmd')
    # A subcommand is mandatory; the chosen name is stored under ``cmd``.
    commands.required = True

    # --- index ------------------------------------------------------------
    fields_help = (
        'fields to include in json output; supported values are "offset", '
        '"length", "filename", "http:status", "http:{http-header}" '
        '(arbitrary http header), and "{warc-header}" (arbitrary warc '
        'record header)'
    )
    index_cmd = commands.add_parser('index', help='WARC/ARC Indexer')
    index_cmd.add_argument('inputs', nargs='*',
                           help='input file(s); default is stdin')
    index_cmd.add_argument('-f', '--fields',
                           default='offset,warc-type,warc-target-uri',
                           help=fields_help)
    index_cmd.add_argument('-o', '--output',
                           help='output file; default is stdout')
    index_cmd.set_defaults(func=indexer)

    # --- recompress -------------------------------------------------------
    recompress_description = (
        'Read an existing, possibly broken WARC '
        'and correctly recompress it to fix any compression errors\n'
        'Also convert any ARC file to a standard compressed WARC file'
    )
    recompress_cmd = commands.add_parser(
        'recompress',
        help='Recompress an existing WARC or ARC',
        description=recompress_description,
    )
    recompress_cmd.add_argument('filename')
    recompress_cmd.add_argument('output')
    recompress_cmd.add_argument('-v', '--verbose', action='store_true')
    recompress_cmd.set_defaults(func=recompressor)

    # --- extract ----------------------------------------------------------
    extract_cmd = commands.add_parser('extract',
                                      help='Extract WARC/ARC Record')
    extract_cmd.add_argument('filename')
    extract_cmd.add_argument('offset')
    # --payload and --headers cannot be combined on one command line.
    output_mode = extract_cmd.add_mutually_exclusive_group()
    output_mode.add_argument(
        '--payload', action='store_true',
        help='output only record payload (after content and transfer decoding, if applicable)')
    output_mode.add_argument(
        '--headers', action='store_true',
        help='output only record headers (and http headers, if applicable)')
    extract_cmd.set_defaults(func=extractor)

    # --- check ------------------------------------------------------------
    check_cmd = commands.add_parser('check', help='WARC digest checker')
    check_cmd.add_argument('inputs', nargs='+')
    check_cmd.add_argument('-v', '--verbose', action='store_true')
    check_cmd.set_defaults(func=checker)

    # Run whichever handler the selected subcommand registered.
    parsed = root.parse_args(args=args)
    parsed.func(parsed)
- # ============================================================================
def get_version():
    """Return the version string used by the ``--version`` argparse action.

    The result is ``'%(prog)s '`` followed by the installed ``warcio``
    distribution version; argparse substitutes ``%(prog)s`` itself.

    ``importlib.metadata`` is preferred. ``pkg_resources`` is only used as
    a fallback on interpreters that lack ``importlib.metadata``, because
    it is deprecated and absent when setuptools is not installed.

    This function is called while the parser is being built, so it must
    not fail just because distribution metadata is missing (for example
    when running from a source checkout). In that case the version is
    reported as ``unknown``.

    :return: version template string, e.g. ``'%(prog)s 1.2.3'``.
    """
    try:
        from importlib import metadata
    except ImportError:
        # Older interpreters without importlib.metadata.
        metadata = None

    if metadata is not None:
        try:
            found = metadata.version('warcio')
        except metadata.PackageNotFoundError:
            found = 'unknown'
    else:
        import pkg_resources
        try:
            found = pkg_resources.get_distribution('warcio').version
        except pkg_resources.DistributionNotFound:
            found = 'unknown'

    return '%(prog)s ' + found
- # ============================================================================
def indexer(cmd):
    """Handle the ``index`` subcommand.

    Constructs an ``Indexer`` from the requested fields, the input list
    and the output target, then calls its ``process_all()``.

    :param cmd: parsed namespace providing ``inputs``, ``fields`` and
        ``output``.
    """
    sources = cmd.inputs
    if not sources:
        # No inputs given: '-' stands for stdin.
        sources = ('-',)

    Indexer(cmd.fields, sources, cmd.output).process_all()
- # ============================================================================
def checker(cmd):
    """Handle the ``check`` subcommand.

    Constructs a ``Checker`` from the parsed namespace and exits the
    process with whatever its ``process_all()`` returns.

    :param cmd: parsed namespace, passed to ``Checker`` unchanged.
    """
    exit_status = Checker(cmd).process_all()
    sys.exit(exit_status)
- # ============================================================================
def extractor(cmd):
    """Handle the ``extract`` subcommand.

    Constructs an ``Extractor`` for the given file and offset and calls
    its ``extract()`` with the ``--payload`` and ``--headers`` flags.

    :param cmd: parsed namespace providing ``filename``, ``offset``,
        ``payload`` and ``headers``.
    """
    Extractor(cmd.filename, cmd.offset).extract(cmd.payload, cmd.headers)
- # ============================================================================
def recompressor(cmd):
    """Handle the ``recompress`` subcommand.

    Constructs a ``Recompressor`` from the source filename, the output
    filename and the verbose flag, then calls its ``recompress()``.

    :param cmd: parsed namespace providing ``filename``, ``output`` and
        ``verbose``.
    """
    Recompressor(cmd.filename, cmd.output, cmd.verbose).recompress()
- # ============================================================================
# Script entry point: run the CLI only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":  # pragma: no cover
    main()
|