#!/usr/bin/env python
"""
Downloads using s3transfer.processpool.ProcessPoolDownloader

Usage
=====

NOTE: Make sure you run ``pip install -r requirements-dev.txt`` before running.

To download a file::

    ./processpool-download -f myfilename -b mybucket -k mykey

To download a prefix recursively to a directory::

    ./processpool-download -d mydirname -b mybucket -p myprefix/
"""
import argparse
import os

import botocore.session
from s3transfer.processpool import ProcessPoolDownloader, ProcessTransferConfig

MB = 1024 * 1024


def download(bucket, key, filename, num_processes, mb_chunksize):
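    # One ProcessTransferConfig drives the transfer: multipart_chunksize sets
    # the size of each ranged GET and max_request_processes caps the number
    # of worker processes.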
    config = ProcessTransferConfig(
        multipart_chunksize=mb_chunksize * MB,
        max_request_processes=num_processes,
    )
    with ProcessPoolDownloader(config=config) as downloader:
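        # download_file returns a future immediately; result() blocks until
        # the download finishes and re-raises any transfer error.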
        future = downloader.download_file(
            bucket=bucket, key=key, filename=filename
        )
        future.result()


def recursive_download(bucket, prefix, dirname, num_processes, mb_chunksize):
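    # List every object under the prefix and queue a download for each one;
    # exiting the ProcessPoolDownloader context waits for all of them to
    # complete.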
    config = ProcessTransferConfig(
        multipart_chunksize=mb_chunksize * MB,
        max_request_processes=num_processes,
    )
    s3 = botocore.session.get_session().create_client('s3')
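    # This client is only used for listing in the parent process; the
    # downloader manages its own clients inside its worker processes.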
    with ProcessPoolDownloader(config=config) as downloader:
        paginator = s3.get_paginator('list_objects')
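        # ListObjects returns at most 1000 keys per response, so paginate
        # to cover arbitrarily large prefixes.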
        for response in paginator.paginate(Bucket=bucket, Prefix=prefix):
            contents = response.get('Contents', [])
            for content in contents:
                key = content['Key']
                filename = os.path.join(dirname, key[len(prefix):])
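                # Mirror the key's directory structure under dirname before
                # queueing the download.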
                parent_dirname = os.path.dirname(filename)
                if not os.path.exists(parent_dirname):
                    os.makedirs(parent_dirname)
                # An expected size is provided so that an additional
                # HeadObject request does not need to be made for each
                # object downloaded.
                downloader.download_file(
                    bucket,
                    key,
                    filename=filename,
                    expected_size=content['Size'],
                )


def main():
    parser = argparse.ArgumentParser(usage=__doc__)
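    # usage=__doc__ surfaces the module docstring above in --help output.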
    parser.add_argument(
        '-b', '--bucket', required=True, help='The S3 bucket to download from'
    )
    single_file_group = parser.add_argument_group('Single file downloads')
    single_file_group.add_argument(
        '-k', '--key', help='The key to download from'
    )
    single_file_group.add_argument(
        '-f', '--filename', help='The name of the file to download to'
    )
    recursive_file_group = parser.add_argument_group(
        'Recursive file downloads'
    )
    recursive_file_group.add_argument(
        '-p', '--prefix', help='The prefix to download from'
    )
    recursive_file_group.add_argument(
        '-d', '--dirname', help='The directory to download to'
    )
    parser.add_argument(
        '-n',
        '--num-processes',
        type=int,
        default=10,
        help='The number of processes to use for the download. '
        '10 by default.',
    )
    parser.add_argument(
        '-c',
        '--mb-chunksize',
        type=int,
        default=8,
        help='The part size in MB to use for the download. 8 MB by default.',
    )
    args = parser.parse_args()
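    # Dispatch based on which argument group was supplied: a key/filename
    # pair for a single download, or a prefix/dirname pair for a recursive
    # one.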
    if args.filename and args.key:
        download(
            args.bucket,
            args.key,
            args.filename,
            args.num_processes,
            args.mb_chunksize,
        )
    elif args.prefix and args.dirname:
        recursive_download(
            args.bucket,
            args.prefix,
            args.dirname,
            args.num_processes,
            args.mb_chunksize,
        )
    else:
        raise ValueError(
            'Either --key and --filename must be provided or '
            '--prefix and --dirname must be provided.'
        )


if __name__ == '__main__':
    main()