#!/usr/bin/env python
"""
Downloads using s3transfer.processpool.ProcessPoolDownloader

Usage
=====

NOTE: Make sure you run ``pip install -r requirements-dev.txt`` before running.

To download a file::

    ./processpool-download -f myfilename -b mybucket -k mykey

To download a prefix recursively to a directory::

    ./processpool-download -d mydirname -b mybucket -p myprefix/
"""
import argparse
import os

import botocore.session
from s3transfer.processpool import ProcessPoolDownloader, ProcessTransferConfig

MB = 1024 * 1024
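
# ProcessPoolDownloader runs transfers across multiple OS processes rather
# than threads, which can help throughput for large objects or large numbers
# of objects since each worker does its I/O and CPU work outside the GIL.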


def download(bucket, key, filename, num_processes, mb_chunksize):
    config = ProcessTransferConfig(
        multipart_chunksize=mb_chunksize * MB,
        max_request_processes=num_processes,
    )
    with ProcessPoolDownloader(config=config) as downloader:
        # download_file() returns a future immediately; the transfer itself
        # runs in the worker processes.
        future = downloader.download_file(
            bucket=bucket, key=key, filename=filename
        )
        # Block until the download finishes, re-raising any transfer error.
        future.result()
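
# Example (hypothetical bucket/key): a single download using 16 processes
# and 16 MB parts:
#
#   ./processpool-download -b mybucket -k mykey -f myfilename -n 16 -c 16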


def recursive_download(bucket, prefix, dirname, num_processes, mb_chunksize):
    config = ProcessTransferConfig(
        multipart_chunksize=mb_chunksize * MB,
        max_request_processes=num_processes,
    )
    s3 = botocore.session.get_session().create_client('s3')
    with ProcessPoolDownloader(config=config) as downloader:
        # Futures are not collected here; exiting the context manager waits
        # for all queued downloads to complete.
        paginator = s3.get_paginator('list_objects')
        # Paginate so prefixes containing more than 1,000 keys are fully
        # listed.
        for response in paginator.paginate(Bucket=bucket, Prefix=prefix):
            contents = response.get('Contents', [])
            for content in contents:
                key = content['Key']
                # Skip "directory" placeholder keys that end with a slash;
                # they cannot be written to a regular file.
                if key.endswith('/'):
                    continue
                # Mirror the key's layout under dirname by stripping the
                # prefix.
                filename = os.path.join(dirname, key[len(prefix) :])
                parent_dirname = os.path.dirname(filename)
                if parent_dirname:
                    os.makedirs(parent_dirname, exist_ok=True)
                # An expected size is provided so an additional HeadObject
                # does not need to be made for each of these objects that
                # get downloaded.
                downloader.download_file(
                    bucket,
                    key,
                    filename=filename,
                    expected_size=content['Size'],
                )
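
# Example (hypothetical bucket/prefix): mirror everything under myprefix/
# into a local directory:
#
#   ./processpool-download -b mybucket -p myprefix/ -d mydirname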


def main():
    parser = argparse.ArgumentParser(usage=__doc__)
    parser.add_argument(
        '-b', '--bucket', required=True, help='The S3 bucket to download from'
    )
    single_file_group = parser.add_argument_group('Single file downloads')
    single_file_group.add_argument(
        '-k', '--key', help='The key to download from'
    )
    single_file_group.add_argument(
        '-f', '--filename', help='The name of the file to download to'
    )
    recursive_file_group = parser.add_argument_group(
        'Recursive file downloads'
    )
    recursive_file_group.add_argument(
        '-p', '--prefix', help='The prefix to download from'
    )
    recursive_file_group.add_argument(
        '-d', '--dirname', help='The directory to download to'
    )
    parser.add_argument(
        '-n',
        '--num-processes',
        type=int,
        default=10,
        help='The number of processes to use for the download. 10 by default.',
    )
    parser.add_argument(
        '-c',
        '--mb-chunksize',
        type=int,
        default=8,
        help='The part size in MB to use for the download. 8 MB by default.',
    )
    args = parser.parse_args()
    if args.filename and args.key:
        download(
            args.bucket,
            args.key,
            args.filename,
            args.num_processes,
            args.mb_chunksize,
        )
    elif args.prefix and args.dirname:
        recursive_download(
            args.bucket,
            args.prefix,
            args.dirname,
            args.num_processes,
            args.mb_chunksize,
        )
    else:
        raise ValueError(
            'Either --key and --filename must be provided or '
            '--prefix and --dirname must be provided.'
        )


if __name__ == '__main__':
    main()