- """
- Sync Media to S3
- ================
- Django command that scans all files in your settings.MEDIA_ROOT folder and
- uploads them to S3 with the same directory structure.
- This command can optionally do the following but it is off by default:
- * gzip compress any CSS and Javascript files it finds and adds the appropriate
- 'Content-Encoding' header.
- * set a far future 'Expires' header for optimal caching.
- Note: This script requires the Python boto library and valid Amazon Web
- Services API keys.
- Required settings.py variables:
- AWS_ACCESS_KEY_ID = ''
- AWS_SECRET_ACCESS_KEY = ''
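
For example, a hypothetical settings.py snippet (placeholder values, not
real credentials):

AWS_ACCESS_KEY_ID = 'AKIAXXXXXXXXXXXXXXXX'
AWS_SECRET_ACCESS_KEY = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'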

Command options are:
  -b BUCKET, --bucket=BUCKET
                        The name of the Amazon bucket you are uploading to.
  -p PREFIX, --prefix=PREFIX
                        The prefix to prepend to the path on S3.
  -d DIRECTORY, --dir=DIRECTORY
                        The root directory to use instead of your MEDIA_ROOT.
  --gzip                Enables gzipping CSS and JavaScript files.
  --expires             Enables setting a far-future Expires header.
  --force               Skip the file mtime check to force upload of all
                        files.
  --remove-missing      Remove any existing keys from the bucket that are not
                        present locally. DANGEROUS!
  --exclude-list        Override the default directory and file exclusion
                        filters. (enter as a comma-separated list)
  --dry-run             Do a dry-run to show what files would be affected.
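
Example usage (assuming this command module is installed as
``sync_media_s3``; the actual command name depends on the filename under
your app's management/commands/ directory):

    python manage.py sync_media_s3 --bucket=my-bucket --gzip --expires
    python manage.py sync_media_s3 --bucket=my-bucket --prefix=media --dry-run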
- """
- import datetime
- from fnmatch import fnmatch
- try:
- from hashlib import md5
- except ImportError:
- from md5 import md5
- import optparse
- import os
- import time
- from django.conf import settings
- from django.core.management.base import BaseCommand, CommandError
- from s3sync.utils import (get_aws_info, get_bucket_and_key, ConfigMissingError,
- upload_file_to_s3)
- # Make sure boto is available
- try:
- import boto
- import boto.exception
- from boto.s3.bucketlistresultset import bucket_lister
- except ImportError:
- raise ImportError("The boto Python library is not installed.")

class Command(BaseCommand):
    # Extra variables to avoid passing these around
    AWS_ACCESS_KEY_ID = ''
    AWS_SECRET_ACCESS_KEY = ''
    AWS_BUCKET_NAME = ''
    DIRECTORY = ''
    EXCLUDE_LIST = []

    upload_count = 0
    skip_count = 0
    remove_bucket_count = 0

    option_list = BaseCommand.option_list + (
        optparse.make_option('-b', '--bucket',
            dest='bucket', default='',
            help="The name of the Amazon bucket you are uploading to."),
        optparse.make_option('-p', '--prefix',
            dest='prefix', default='',
            help="The prefix to prepend to the path on S3."),
        optparse.make_option('-d', '--dir',
            dest='dir', default=settings.MEDIA_ROOT,
            help="The root directory to use instead of your MEDIA_ROOT."),
        optparse.make_option('--gzip',
            action='store_true', dest='gzip', default=False,
            help="Enables gzipping CSS and JavaScript files."),
        optparse.make_option('--expires',
            action='store_true', dest='expires', default=False,
            help="Enables setting a far-future Expires header."),
        optparse.make_option('--force',
            action='store_true', dest='force', default=False,
            help="Skip the file mtime check to force upload of all files."),
        optparse.make_option('--remove-missing',
            action='store_true', dest='remove_missing', default=False,
            help="Remove keys from the bucket for files that are missing locally."),
        optparse.make_option('--dry-run',
            action='store_true', dest='dry_run', default=False,
            help="Do a dry-run to show what files would be affected."),
        optparse.make_option('--exclude-list', dest='exclude_list',
            action='store', default='',
            help="Override the default directory and file exclusion filters. "
                 "(enter as a comma-separated list)"),
        # TODO: implement
        optparse.make_option('--hash-chunk-size', dest='hash_chunk',
            action='store', default=4096,
            help="The chunk size (in bytes) to read when hashing files for "
                 "comparison. (not yet implemented)"),
    )

    help = ('Syncs the complete MEDIA_ROOT structure and files to S3 into '
            'the given bucket name.')

    def handle(self, *args, **options):
        # Check for AWS keys in settings
        try:
            (self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY,
             self.AWS_S3_HOST) = get_aws_info()
        except ConfigMissingError:
            raise CommandError('Missing AWS keys from settings file. '
                               'Please supply both AWS_ACCESS_KEY_ID and '
                               'AWS_SECRET_ACCESS_KEY.')

        self.AWS_BUCKET_NAME = options.get('bucket')
        if not self.AWS_BUCKET_NAME:
            raise CommandError('No bucket specified. Use --bucket=name')
        if not settings.MEDIA_ROOT:
            raise CommandError('MEDIA_ROOT must be set in your settings.')

        self.verbosity = int(options.get('verbosity'))
        # TODO: compare first hash chunk of files to see if they're identical
        self.hash_chunk = int(options.get('hash_chunk'))
        self.prefix = options.get('prefix')
        self.do_gzip = options.get('gzip')
        self.do_expires = options.get('expires')
        self.do_force = options.get('force')
        self.remove_missing = options.get('remove_missing')
        self.dry_run = options.get('dry_run')
        self.DIRECTORY = options.get('dir')
        exclude_list = options.get('exclude_list')
        if exclude_list and isinstance(exclude_list, list):
            # command line option overrides default exclude_list
            self.EXCLUDE_LIST = exclude_list
        elif exclude_list:
            self.EXCLUDE_LIST = exclude_list.split(',')

        # Now call the syncing method to walk the MEDIA_ROOT directory and
        # upload all files found.
        self.sync_s3()

        print("%d files uploaded." % self.upload_count)
        print("%d files skipped." % self.skip_count)
        if self.remove_missing:
            print("%d keys removed from bucket." % self.remove_bucket_count)
        if self.dry_run:
            print('THIS IS A DRY RUN, NO ACTUAL CHANGES.')

    def sync_s3(self):
        """
        Walks the media directory and syncs files to S3.
        """
        bucket, key = get_bucket_and_key(self.AWS_BUCKET_NAME)
        # Bucket keys that have not (yet) matched a local file, and local
        # keys already handled; used by the --remove-missing pass.
        self.s3_files = {}
        self.files_processed = set()
        os.path.walk(self.DIRECTORY, self.upload_s3,
                     (bucket, key, self.AWS_BUCKET_NAME, self.DIRECTORY))
        # Remove files on bucket if they're missing locally
        if self.remove_missing:
            self.remove_s3(bucket)

    def find_key_in_list(self, s3_list, file_key):
        """
        Returns the S3 key matching file_key, consuming the lazy bucket
        listing only as far as needed and caching unmatched keys.
        """
        if file_key in self.s3_files:
            return self.s3_files[file_key]
        for s3_key in s3_list:
            if s3_key.name == file_key:
                return s3_key
            if s3_key.name not in self.files_processed:
                self.s3_files[s3_key.name] = s3_key
        return None

    def finish_list(self, s3_list):
        """
        Drains the rest of the bucket listing so every unmatched key is cached.
        """
        for s3_key in s3_list:
            if s3_key.name not in self.files_processed:
                self.s3_files[s3_key.name] = s3_key

    def remove_s3(self, bucket):
        if not self.s3_files:
            if self.verbosity > 0:
                print('No files to remove.')
            return
        for key, value in self.s3_files.items():
            if not self.dry_run:
                bucket.delete_key(value.name)
            self.remove_bucket_count += 1
            print("Deleting %s..." % key)

    def upload_s3(self, arg, dirname, names):
        """
        This is the callback to os.path.walk and where much of the work happens.
        """
        bucket, key, bucket_name, root_dir = arg

        # Skip directories we don't want to sync
        for pattern in self.EXCLUDE_LIST:
            if fnmatch(os.path.basename(dirname), pattern):
                if self.verbosity > 1:
                    print('Skipping directory %s (rule: %s)' % (dirname, pattern))
                del names[:]
                return

        # Later we assume the MEDIA_ROOT ends with a trailing slash
        if not root_dir.endswith(os.path.sep):
            root_dir = root_dir + os.path.sep

        list_prefix = dirname[len(root_dir):]
        if self.prefix:
            list_prefix = '%s/%s' % (self.prefix, list_prefix)
        s3_list = bucket_lister(bucket, prefix=list_prefix)

        for name in names:
            # Skip files we don't want to sync
            bad_name = None
            for pattern in self.EXCLUDE_LIST:
                if fnmatch(name, pattern):
                    bad_name = pattern
                    break
            if bad_name:
                if self.verbosity > 1:
                    print('Skipping: %s (rule: %s)' % (name, bad_name))
                continue

            filename = os.path.join(dirname, name)
            if os.path.isdir(filename):
                continue  # Don't try to upload directories

            file_key = filename[len(root_dir):]
            if self.prefix:
                file_key = '%s/%s' % (self.prefix, file_key)

            # Check if file on S3 is older than local file, if so, upload
            # TODO: check if hash chunk corresponds
            if not self.do_force:
                s3_key = self.find_key_in_list(s3_list, file_key)
                if s3_key:
                    s3_datetime = datetime.datetime(*time.strptime(
                        s3_key.last_modified, '%Y-%m-%dT%H:%M:%S.000Z')[0:6])
                    local_datetime = datetime.datetime.utcfromtimestamp(
                        os.stat(filename).st_mtime)
                    if local_datetime < s3_datetime:
                        self.skip_count += 1
                        if self.verbosity > 1:
                            print("File %s hasn't been modified since last "
                                  "being uploaded" % file_key)
                        if file_key in self.s3_files:
                            self.files_processed.add(file_key)
                            del self.s3_files[file_key]
                        continue
            if file_key in self.s3_files:
                self.files_processed.add(file_key)
                del self.s3_files[file_key]

            # File is newer, let's process and upload
            if self.verbosity > 0:
                print("Uploading %s..." % file_key)

            if self.dry_run:
                self.upload_count += 1
                continue

            try:
                upload_file_to_s3(file_key, filename, key,
                                  do_gzip=self.do_gzip, do_expires=self.do_expires,
                                  verbosity=self.verbosity)
            except boto.exception.S3CreateError as e:
                # TODO: retry to create a few times
                print("Failed to upload: %s" % e)
            except Exception as e:
                print(e)
                raise
            else:
                self.upload_count += 1

        # If we don't care about what's missing, wipe this to save memory.
        if not self.remove_missing:
            self.s3_files = {}
        else:
            self.finish_list(s3_list)
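

# A minimal sketch of invoking this command from other code (for example a
# deployment script) via Django's call_command helper. The command name
# 'sync_media_s3' is an assumption here -- the real name is whatever filename
# this module has under your app's management/commands/ directory.
#
#     from django.core.management import call_command
#     call_command('sync_media_s3', bucket='my-bucket', gzip=True, expires=True)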