  1. """
  2. Sync Media to S3
  3. ================
  4. Django command that scans all files in your settings.MEDIA_ROOT folder and
  5. uploads them to S3 with the same directory structure.
  6. This command can optionally do the following but it is off by default:
  7. * gzip compress any CSS and Javascript files it finds and adds the appropriate
  8. 'Content-Encoding' header.
  9. * set a far future 'Expires' header for optimal caching.
  10. Note: This script requires the Python boto library and valid Amazon Web
  11. Services API keys.
  12. Required settings.py variables:
  13. AWS_ACCESS_KEY_ID = ''
  14. AWS_SECRET_ACCESS_KEY = ''
  15. Command options are:
  16. -b BUCKET, --bucket=BUCKET
  17. The name of the Amazon bucket you are uploading to.
  18. -p PREFIX, --prefix=PREFIX
  19. The prefix to prepend to the path on S3.
  20. -d DIRECTORY, --dir=DIRECTORY
  21. The root directory to use instead of your MEDIA_ROOT
  22. --gzip Enables gzipping CSS and Javascript files.
  23. --expires Enables setting a far future expires header.
  24. --force Skip the file mtime check to force upload of all
  25. files.
  26. --remove-missing
  27. Remove any existing keys from the bucket that are not
  28. present in your local. DANGEROUS!
  29. --exclude-list Override default directory and file exclusion
  30. filters. (enter as comma separated line)
  31. --dry-run
  32. Do A dry-run to show what files would be affected.
  33. """
import datetime
from fnmatch import fnmatch
try:
    from hashlib import md5
except ImportError:
    from md5 import md5
import optparse
import os
import time

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError

from s3sync.utils import (get_aws_info, get_bucket_and_key, ConfigMissingError,
                          upload_file_to_s3)

# Make sure boto is available
try:
    import boto
    import boto.exception
    from boto.s3.bucketlistresultset import bucket_lister
except ImportError:
    raise ImportError("The boto Python library is not installed.")
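

# Hypothetical helper (not part of the original command) sketching the
# "--hash-chunk-size" TODO below: hash only the first chunk_size bytes of a
# file so large files can be compared cheaply before deciding to upload.
def _hash_first_chunk(path, chunk_size=4096):
    f = open(path, 'rb')
    try:
        return md5(f.read(chunk_size)).hexdigest()
    finally:
        f.close()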


class Command(BaseCommand):
    # Extra variables to avoid passing these around
    AWS_ACCESS_KEY_ID = ''
    AWS_SECRET_ACCESS_KEY = ''
    AWS_BUCKET_NAME = ''
    DIRECTORY = ''
    EXCLUDE_LIST = []
    upload_count = 0
    skip_count = 0
    remove_bucket_count = 0

    option_list = BaseCommand.option_list + (
        optparse.make_option('-b', '--bucket',
            dest='bucket', default='',
            help="The name of the Amazon bucket you are uploading to."),
        optparse.make_option('-p', '--prefix',
            dest='prefix', default='',
            help="The prefix to prepend to the path on S3."),
        optparse.make_option('-d', '--dir',
            dest='dir', default=settings.MEDIA_ROOT,
            help="The root directory to use instead of your MEDIA_ROOT."),
        optparse.make_option('--gzip',
            action='store_true', dest='gzip', default=False,
            help="Enables gzipping CSS and JavaScript files."),
        optparse.make_option('--expires',
            action='store_true', dest='expires', default=False,
            help="Enables setting a far-future Expires header."),
        optparse.make_option('--force',
            action='store_true', dest='force', default=False,
            help="Skip the file mtime check to force upload of all files."),
        optparse.make_option('--remove-missing',
            action='store_true', dest='remove_missing', default=False,
            help="Remove keys in the bucket for files missing locally."),
        optparse.make_option('--dry-run',
            action='store_true', dest='dry_run', default=False,
            help="Do a dry run to show what files would be affected."),
        optparse.make_option('--exclude-list', dest='exclude_list',
            action='store', default='',
            help="Override default directory and file exclusion filters. "
                 "(enter as a comma-separated list)"),
        # TODO: implement
        optparse.make_option('--hash-chunk-size', dest='hash_chunk',
            action='store', default=4096,
            help="Override the number of bytes hashed when comparing a "
                 "local file against its key on S3."),
    )
    help = ('Syncs the complete MEDIA_ROOT structure and files to S3 into '
            'the given bucket name.')

    def handle(self, *args, **options):
        # Check for AWS keys in settings
        try:
            (self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY,
             self.AWS_S3_HOST) = get_aws_info()
        except ConfigMissingError:
            raise CommandError('Missing AWS keys from settings file. Please '
                               'supply both AWS_ACCESS_KEY_ID and '
                               'AWS_SECRET_ACCESS_KEY.')
        self.AWS_BUCKET_NAME = options.get('bucket')
        if not self.AWS_BUCKET_NAME:
            raise CommandError('No bucket specified. Use --bucket=name')
        if not settings.MEDIA_ROOT:
            raise CommandError('MEDIA_ROOT must be set in your settings.')
        self.verbosity = int(options.get('verbosity'))
        # TODO: compare first hash chunk of files to see if they're identical
        self.hash_chunk = int(options.get('hash_chunk'))
        self.prefix = options.get('prefix')
        self.do_gzip = options.get('gzip')
        self.do_expires = options.get('expires')
        self.do_force = options.get('force')
        self.remove_missing = options.get('remove_missing')
        self.dry_run = options.get('dry_run')
        self.DIRECTORY = options.get('dir')
        exclude_list = options.get('exclude_list')
        if exclude_list and isinstance(exclude_list, list):
            # An already-parsed list overrides the default EXCLUDE_LIST
            self.EXCLUDE_LIST = exclude_list
        elif exclude_list:
            # The command line option arrives as a comma-separated string
            self.EXCLUDE_LIST = exclude_list.split(',')
        # Now call the syncing method to walk the MEDIA_ROOT directory and
        # upload all files found.
        self.sync_s3()
        print("%d files uploaded." % self.upload_count)
        print("%d files skipped." % self.skip_count)
        if self.remove_missing:
            print("%d keys removed from bucket." % self.remove_bucket_count)
        if self.dry_run:
            print('THIS IS A DRY RUN, NO ACTUAL CHANGES.')

    def sync_s3(self):
        """
        Walks the media directory and syncs files to S3
        """
        bucket, key = get_bucket_and_key(self.AWS_BUCKET_NAME)
        self.s3_files = {}
        self.files_processed = set()
        os.path.walk(self.DIRECTORY, self.upload_s3,
                     (bucket, key, self.AWS_BUCKET_NAME, self.DIRECTORY))
        # Remove files from the bucket if they're missing locally
        if self.remove_missing:
            self.remove_s3(bucket)

    def find_key_in_list(self, s3_list, file_key):
        """Find file_key in the bucket listing, caching other keys seen."""
        if file_key in self.s3_files:
            return self.s3_files[file_key]
        for s3_key in s3_list:
            if s3_key.name == file_key:
                return s3_key
            # Remember keys we pass over; anything still cached at the end
            # is a candidate for --remove-missing.
            if s3_key.name not in self.files_processed:
                self.s3_files[s3_key.name] = s3_key
        return None

    def finish_list(self, s3_list):
        """Drain the rest of a bucket listing into the removal cache."""
        for s3_key in s3_list:
            if s3_key.name not in self.files_processed:
                self.s3_files[s3_key.name] = s3_key

    def remove_s3(self, bucket):
        if not self.s3_files:
            if self.verbosity > 0:
                print('No files to remove.')
            return
        for key, value in self.s3_files.items():
            if not self.dry_run:
                bucket.delete_key(value.name)
            self.remove_bucket_count += 1
            print("Deleting %s..." % key)

    def upload_s3(self, arg, dirname, names):
        """
        This is the callback to os.path.walk and where much of the work happens
        """
        bucket, key, bucket_name, root_dir = arg
        # Skip directories (and their contents) we don't want to sync
        for pattern in self.EXCLUDE_LIST:
            if fnmatch(os.path.basename(dirname), pattern):
                if self.verbosity > 1:
                    print('Skipping: %s (rule: %s)' % (names, pattern))
                # Emptying names stops os.path.walk from descending further
                del names[:]
                return
        # Later we assume the MEDIA_ROOT ends with a trailing slash
        if not root_dir.endswith(os.path.sep):
            root_dir = root_dir + os.path.sep
        list_prefix = dirname[len(root_dir):]
        if self.prefix:
            list_prefix = '%s/%s' % (self.prefix, list_prefix)
        s3_list = bucket_lister(bucket, prefix=list_prefix)
        for name in names:
            bad_name = False
            for pattern in self.EXCLUDE_LIST:
                if fnmatch(name, pattern):
                    bad_name = True  # Skip files we don't want to sync
                    break
            if bad_name:
                if self.verbosity > 1:
                    print('Skipping: %s (rule: %s)' % (name, pattern))
                continue
            filename = os.path.join(dirname, name)
            if os.path.isdir(filename):
                continue  # Don't try to upload directories
            file_key = filename[len(root_dir):]
            if self.prefix:
                file_key = '%s/%s' % (self.prefix, file_key)
            # Check if the file on S3 is older than the local file; if so,
            # upload it.
            # TODO: check if hash chunk corresponds
            if not self.do_force:
                s3_key = self.find_key_in_list(s3_list, file_key)
                if s3_key:
                    s3_datetime = datetime.datetime(*time.strptime(
                        s3_key.last_modified, '%Y-%m-%dT%H:%M:%S.000Z')[0:6])
                    local_datetime = datetime.datetime.utcfromtimestamp(
                        os.stat(filename).st_mtime)
                    if local_datetime < s3_datetime:
                        self.skip_count += 1
                        if self.verbosity > 1:
                            print("File %s hasn't been modified since last "
                                  "being uploaded" % file_key)
                        if file_key in self.s3_files:
                            self.files_processed.add(file_key)
                            del self.s3_files[file_key]
                        continue
            if file_key in self.s3_files:
                self.files_processed.add(file_key)
                del self.s3_files[file_key]
            # File is newer; process and upload it
            if self.verbosity > 0:
                print("Uploading %s..." % file_key)
            if self.dry_run:
                self.upload_count += 1
                continue
            try:
                upload_file_to_s3(file_key, filename, key,
                    do_gzip=self.do_gzip, do_expires=self.do_expires,
                    verbosity=self.verbosity)
            except boto.exception.S3CreateError as e:
                # TODO: retry the upload a few times
                print("Failed to upload: %s" % e)
            except Exception as e:
                print(e)
                raise
            else:
                self.upload_count += 1
        # If we don't care about what's missing, wipe this to save memory.
        if not self.remove_missing:
            self.s3_files = {}
        else:
            self.finish_list(s3_list)
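

# For reference, a minimal sketch of the s3sync.utils helpers this command
# relies on, inferred purely from the call sites above. The names come from
# the import at the top of this file; the bodies are illustrative
# assumptions only (the real upload_file_to_s3 also applies the gzip and
# Expires headers), so they are left commented out rather than shadowing
# the imported functions:
#
#     def get_aws_info():
#         """Return (access_key, secret_key, s3_host), or raise
#         ConfigMissingError if the settings are absent."""
#         key = getattr(settings, 'AWS_ACCESS_KEY_ID', '')
#         secret = getattr(settings, 'AWS_SECRET_ACCESS_KEY', '')
#         if not (key and secret):
#             raise ConfigMissingError
#         return key, secret, getattr(settings, 'AWS_S3_HOST', None)
#
#     def get_bucket_and_key(bucket_name):
#         """Connect to S3 and return (bucket, reusable Key object)."""
#         key_id, secret, host = get_aws_info()
#         conn = boto.connect_s3(key_id, secret)
#         bucket = conn.create_bucket(bucket_name)
#         return bucket, boto.s3.key.Key(bucket)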