# Amazon S3 Helper functions
import sys
import http.client
import time
import hashlib
import string
import random
import re
import hmac
import mimetypes
import base64
import urllib.parse
import os
import traceback
from stat import ST_SIZE, ST_MTIME
from xml.etree import ElementTree

REMOTE_ENC = 'utf-8'
SEND_CHUNK_SIZE = 8192
LOG_FILE = ''
STORAGE_CLASS = 'REDUCED_REDUNDANCY'

class FileInfo:
    """Attributes of a single local or remote file."""
    def __init__(self):
        self.base_path = ""
        self.path = ""
        self.md5 = ""
        self.size = 0
        self.mod_time = 0
        self.storage_class = ""
        self.placeholder_file = None

    def __str__(self):
        try:
            return "\"" + self.path + "\" [" + str(self.size) + " bytes]"
        except Exception:
            return "(Error preparing file info string)"

    def get_full_path(self):
        return os.path.join(self.base_path, self.path)

    def calc_md5(self, dir):
        full_path = os.path.join(dir, self.path)
        try:
            log("Calculating MD5 for " + full_path)
            out = get_file_md5(full_path)
        except Exception:
            log("Error calculating MD5 for: " + full_path + " " + str(sys.exc_info()[0]))
            out = ""
        return out

class FileInfoMap:
    """Indexes FileInfo records by path, by MD5, and by size."""
    def __init__(self):
        self.by_path = {}
        self.by_md5 = {}
        self.by_size = {}

    def add_file(self, info):
        self.by_path[info.path] = info
        # Group non-empty files by size so duplicates can be found cheaply.
        if info.size > 0:
            if info.size in self.by_size:
                self.by_size[info.size].append(info)
            else:
                self.by_size[info.size] = [info]
        if info.md5 != "":
            self.by_md5[info.md5] = info

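# A minimal usage sketch (the path, size, and hash below are hypothetical
# placeholders): populate a map with one file and look it up via each index.
def _example_file_info_map():
    info = FileInfo()
    info.path = "photos/cat.jpg"                      # hypothetical path
    info.size = 1024                                  # hypothetical size
    info.md5 = "d41d8cd98f00b204e9800998ecf8427e"     # hypothetical hash
    m = FileInfoMap()
    m.add_file(info)
    assert m.by_path["photos/cat.jpg"] is info
    assert m.by_size[1024] == [info]
    assert m.by_md5[info.md5] is info
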
outfile = None

def close_log_file():
    global outfile
    if LOG_FILE != "" and outfile is not None:
        outfile.close()

def log(msg):
    global outfile
    if LOG_FILE != "" and outfile is None:
        outfile = open(LOG_FILE, "w")
    try:
        print(msg)
        if LOG_FILE != "":
            outfile.write(msg + "\n")
    except Exception:
        print("Error printing or writing log file line.")

def save_remote_map(remote_map, cache_file):
    # Writes one line per file, sorted by path. Leading tabs on a line mark
    # how many directory components are shared with the previous line, so
    # common path prefixes are stored only once.
    paths = list(remote_map.by_path.keys())
    paths.sort()
    log("Saving remote map ...")
    fh = open(cache_file, 'w+')
    dir_stack = []
    for path in paths:
        line = ""
        dir_parts = path.split("/")
        file_part = dir_parts[-1]
        dir_parts = dir_parts[0:-1]
        # One tab per directory component shared with the previous path.
        i = 0
        while i < len(dir_stack) and i < len(dir_parts) and dir_parts[i] == dir_stack[i]:
            line += "\t"
            i += 1
        # Pop stack entries that are no longer part of the current path.
        while i < len(dir_stack):
            dir_stack.pop()
        # Emit the directory components that are new on this line.
        while i < len(dir_parts):
            line += dir_parts[i] + "/"
            dir_stack.append(dir_parts[i])
            i += 1
        info = remote_map.by_path[path]
        line += file_part + "\t" + str(info.size) + "\t" + info.md5 + "\t" + str(info.mod_time) + "\n"
        try:
            fh.write(line)
        except Exception:
            log("Error writing line of remote map.")
    fh.close()
    log("Saved remote map.")

def load_remote_map(cache_file):
    log("Reading remote cache file...")
    fh = open(cache_file, "r+")
    map = FileInfoMap()
    lines = fh.readlines()
    fh.close()
    log("Read remote cache file.")
    log("Creating remote map...")
    dir_stack = []
    indent = 0
    for line in lines:
        path = ""
        # Leading tabs reuse directory components from the previous line.
        i = 0
        while line[i] == '\t':
            path += dir_stack[i] + "/"
            i += 1
        line = line[i:]
        while indent > i:
            dir_stack.pop()
            indent -= 1
        record = line.split("\t")
        dir_parts = record[0].split("/")
        file_part = dir_parts[-1]
        dir_parts = dir_parts[:-1]
        for part in dir_parts:
            dir_stack.append(part)
            indent += 1
            path += part + "/"

        info = FileInfo()
        info.path = path + file_part
        info.size = int(record[1])
        info.md5 = record[2]
        info.mod_time = int(record[3])
        map.add_file(info)
    log("Created remote map")
    return map

def execute_operations(access_key, secret_key, local_dir, bucket, prefix, to_upload, to_copy, to_delete,
                       remote_map, storage_class, access_level):
    # Applies the planned operations and updates remote_map to match.
    remote_dir_base = "/" + bucket + "/" + prefix
    total_ops = len(to_copy) + len(to_upload) + len(to_delete)
    op_num = 1
    for copy_op in to_copy:
        copy_local_dst = copy_op[0]
        copy_remote_src = copy_op[1]
        amz_headers = {}
        amz_headers['x-amz-storage-class'] = storage_class
        amz_headers['x-amz-acl'] = access_level
        amz_headers['x-amz-copy-source'] = remote_dir_base + copy_remote_src.path
        try:
            status = s3_operation(access_key, secret_key, "PUT", remote_dir_base + copy_local_dst.path, "", amz_headers)
            if status == 200:
                remote_map.add_file(copy_local_dst)
                log("Copied to: " + str(copy_local_dst) + " from " + str(copy_remote_src)
                    + " (" + str(op_num) + "/" + str(total_ops) + ")")
            else:
                log("Tried to copy, source not found, status: " + str(status))
        except Exception:
            log("Error copying file: " + str(copy_local_dst) + " from " + str(copy_remote_src) + ": " + str(sys.exc_info()[0]))
        op_num += 1

    for local in to_upload:
        amz_headers = {}
        amz_headers['x-amz-storage-class'] = storage_class
        amz_headers['x-amz-acl'] = access_level
        try:
            s3_operation(access_key, secret_key, "PUT", remote_dir_base + local.path, "", amz_headers, local)
            remote_map.add_file(local)
            log("Uploaded: " + str(local) + " (" + str(op_num) + "/" + str(total_ops) + ")")
        except Exception:
            log("Error uploading file: " + str(local) + ": " + str(sys.exc_info()[0]) + ": " + str(sys.exc_info()[1]))
            traceback.print_exc(file=sys.stdout)
        op_num += 1

    for remote_to_delete in to_delete:
        try:
            s3_operation(access_key, secret_key, "DELETE", remote_dir_base + remote_to_delete.path, "", {})
            # Only by_path is updated because only by_path is used when
            # saving the map.
            del remote_map.by_path[remote_to_delete.path]
            log("Deleted: " + str(remote_to_delete) + " (" + str(op_num) + "/" + str(total_ops) + ")")
        except Exception:
            log("Error deleting file: " + str(remote_to_delete) + ": " + str(sys.exc_info()[0]))
        op_num += 1

def determine_operations(local_dir, local_map, remote_map):
    log("Comparing local and remote files...")
    to_upload = []
    to_copy = []
    # Start by assuming every remote file is deletable; matches are removed
    # from this dict as local files are examined.
    to_delete = remote_map.by_path.copy()

    # Disabled experiment: eliminate duplicate local files and replace them
    # with placeholders, grouping by size and then comparing MD5s.
    #size_cutoff = 1024
    #log("Checking for duplicate local files...")
    #local_sizes = list(local_map.by_size.keys())
    #for local_size in local_sizes:
    #    if local_size > size_cutoff and len(local_map.by_size[local_size]) > 1:
    #        same_size_files = local_map.by_size[local_size]
    #        # Calc MD5s, reusing the remote MD5 where size and mtime match.
    #        for local in same_size_files:
    #            remote_by_path = remote_map.by_path.get(local.path)
    #            if remote_by_path.size == local.size and remote_by_path.mod_time == local.mod_time:
    #                local.md5 = remote_by_path.md5
    #            else:
    #                local.md5 = local.calc_md5(local_dir)
    #        # Find duplicates by MD5.
    #        by_md5 = {}
    #        for local in same_size_files:
    #            if local.md5 in by_md5:
    #                matching_file = by_md5[local.md5]
    #                log("Duplicate local file: " + str(local) + " same as " + str(matching_file))
    #                local.placeholder_file = matching_file
    #            else:
    #                by_md5[local.md5] = local

    file_list = local_map.by_path.values()
    for local in file_list:
        try:
            str(local)
        except UnicodeEncodeError:
            print("Unable to convert filename for display: " + repr(local.path))
            continue

        remote_by_path = remote_map.by_path.get(local.path)
        if remote_by_path is not None:
            # Same path exists remotely: upload if the size differs, or if
            # the modification times disagree and the MD5s turn out to differ.
            if local.size != remote_by_path.size:
                to_upload.append(local)
            elif local.mod_time != remote_by_path.mod_time or local.mod_time == 0 or remote_by_path.mod_time == 0:
                if local.md5 == "":
                    local.md5 = local.calc_md5(local_dir)
                if local.md5 == remote_by_path.md5:
                    # Content is identical; adopt the local mod time.
                    remote_by_path.mod_time = local.mod_time
                else:
                    to_upload.append(local)
            del to_delete[local.path]
        else:
            if local.size > 0:
                remote_by_size_bucket = remote_map.by_size.get(local.size)
                if remote_by_size_bucket is None:
                    to_upload.append(local)
                else:
                    # A remote file of the same size exists; if the MD5s also
                    # match, plan a cheap server-side copy instead of an upload.
                    if local.md5 == "":
                        local.md5 = local.calc_md5(local_dir)
                    remote_by_md5 = remote_map.by_md5.get(local.md5)
                    if remote_by_md5 is None or remote_by_md5.size != local.size:
                        to_upload.append(local)
                    else:
                        to_copy.append([local, remote_by_md5])
            else:
                # Zero-byte files are always uploaded.
                to_upload.append(local)

    to_delete = list(to_delete.values())
    for file in to_upload:
        log("About to upload: " + str(file))
    for dst_src in to_copy:
        log("About to copy to: " + str(dst_src[0]) + " from " + str(dst_src[1]))
    for file in to_delete:
        log("About to delete: " + str(file))
    log("Done comparing local and remote files.")
    return to_upload, to_copy, to_delete

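# A sketch of the overall sync pipeline these helpers support; the
# credentials, bucket, and paths below are hypothetical placeholders.
def _example_sync():
    access_key = "AKIA..."                 # hypothetical
    secret_key = "secret..."               # hypothetical
    bucket = "mybucket"                    # hypothetical
    prefix = "backup/"                     # hypothetical
    local_dir = "/home/me/photos/"         # hypothetical
    local_map = get_local_file_map(local_dir, r".*\.tmp$")
    remote_map = get_remote_file_map(access_key, secret_key, bucket, prefix)
    to_upload, to_copy, to_delete = determine_operations(local_dir, local_map, remote_map)
    execute_operations(access_key, secret_key, local_dir, bucket, prefix,
                       to_upload, to_copy, to_delete, remote_map,
                       STORAGE_CLASS, "private")
    save_remote_map(remote_map, local_dir + ".s3cache")
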
def get_local_file_map(local_dir, exclude):
    local_dir_len = len(local_dir)
    map = FileInfoMap()
    log("Walking directory tree...")
    log("Excluding: " + str(exclude))

    paths = []
    for root, dirs, files in os.walk(local_dir):
        for name in files:
            full_path = os.path.join(root, name)
            # Skip anything matching the exclude regex.
            if re.match(exclude, full_path) is None:
                paths.append(full_path)

    log("Loaded local files")
    log("Getting local file attributes and building map...")
    for full_path in paths:
        try:
            stats = os.stat(full_path)
            info = FileInfo()
            info.base_path = local_dir
            info.path = (full_path[local_dir_len:]).replace("\\", "/")
            info.size = int(stats[ST_SIZE])
            info.mod_time = int(stats[ST_MTIME])
            map.add_file(info)
        except OSError:
            log("Error getting stats for: " + full_path + " " + str(sys.exc_info()[0]))
    log("Built map of local files.")
    return map

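# For example (hypothetical paths): "exclude" is a regex matched against each
# file's full path with re.match, so this would skip anything under a ".git"
# directory while scanning:
# local_map = get_local_file_map("/home/me/photos/", r".*/\.git/.*")
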
def get_remote_file_map(access_key, secret_key, bucket, prefix):
    map = FileInfoMap()
    marker = ""
    prefix_len = len(prefix)
    while True:
        url = "/" + bucket + "/"
        if marker != "" or prefix != "":
            query_str = "?"
        else:
            query_str = ""
        if marker != "":
            quoted_marker = urllib.parse.quote(marker)
            query_str += "marker=" + quoted_marker
            if prefix != "":
                query_str += "&"
        if prefix != "":
            query_str += "prefix=" + urllib.parse.quote(prefix)
        result_dict = s3_operation(access_key, secret_key, "GET", url, query_str, {}, None)

        try:
            result_dict = result_dict['ListBucketResult']
        except (KeyError, TypeError):
            # The bucket may not exist; just return an empty map.
            return map
        if 'Contents' in result_dict:
            contents = result_dict['Contents']
            # A single item comes back as a dict, so wrap it in a list
            # for the loop below.
            if isinstance(contents, dict):
                contents = [contents]

            for file in contents:
                path_in_bucket = file['Key']
                if path_in_bucket.endswith("/"):
                    # Skip the directory listings
                    continue
                info = FileInfo()
                info.base_path = url + prefix
                info.path = path_in_bucket[prefix_len:]
                info.storage_class = file['StorageClass']
                # The ETag is the object's MD5 (in quotes) for single-PUT uploads.
                info.md5 = file['ETag'][1:-1]
                info.size = int(file['Size'])
                map.add_file(info)
        else:
            contents = []
        if result_dict['IsTruncated'] != 'true':
            break
        else:
            # Listings are paginated; continue from the last key returned.
            marker = contents[-1]['Key']
            log("Loaded remote to: " + marker)
    return map

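# The listing request issued for each page looks like this (hypothetical
# bucket and key names):
#   GET /mybucket/?marker=backup%2F0999.jpg&prefix=backup%2F
# where marker is the last key of the previous page; S3 flags further pages
# with <IsTruncated>true</IsTruncated> in the response body.
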
def s3_operation(access_key, secret_key, method, path, query_str="", amz_headers=None, body_file=None):
    if amz_headers is None:
        amz_headers = {}
    server = 's3.amazonaws.com'
    conn = http.client.HTTPConnection(server)
    # This could perhaps come from body_file.md5 when one is given.
    content_md5 = ""
    content_type = mimetypes.guess_type(path)[0]
    if content_type is None:
        content_type = ""

    resource_str = urllib.parse.quote(path)
    date_str = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())
    amz_headers_str = ""

    # x-amz-* headers are signed in sorted order.
    sorted_keys = list(amz_headers.keys())
    sorted_keys.sort()
    for key in sorted_keys:
        if key == 'x-amz-copy-source':
            amz_headers[key] = urllib.parse.quote(amz_headers[key])
        amz_headers_str += key + ":" + amz_headers[key] + "\n"
    # AWS signature version 2. From: http://mashupguide.net/1.0/html/ch16s05.xhtml
    string_to_sign = method + "\n" + content_md5 + "\n" + content_type + "\n" + date_str + "\n" + amz_headers_str + resource_str

    signature = base64.b64encode(hmac.new(secret_key.encode(REMOTE_ENC), string_to_sign.encode(REMOTE_ENC), hashlib.sha1).digest())

    headers = amz_headers
    headers['Date'] = date_str
    # Doesn't hurt to query the size again here, so that in case it changed,
    # we have it up to date.
    if body_file is not None:
        body_file.size = os.stat(body_file.get_full_path())[ST_SIZE]
        headers['Content-Length'] = body_file.size

    headers['Content-Type'] = content_type
    headers['Authorization'] = "AWS " + access_key + ":" + signature.decode(REMOTE_ENC)
    conn.connect()
    url = resource_str + query_str
    if method == "PUT":
        # Send headers by hand so the body can be streamed in chunks below.
        # (Adapted from S3.py.)
        conn.putrequest(method, url)
        header_keys = headers.keys()
        for header in header_keys:
            conn.putheader(header, headers[header])
        conn.endheaders()
    else:
        conn.request(method, url, "", headers)
    if body_file is not None:
        # Stream the file while computing its MD5 on the fly.
        fh = open(body_file.get_full_path(), 'rb')
        fh.seek(0)
        md5_hash = hashlib.md5()
        size_left = body_file.size
        while size_left > 0:
            data = fh.read(SEND_CHUNK_SIZE)
            md5_hash.update(data)
            conn.send(data)
            size_left -= len(data)
        body_file.md5 = md5_hash.hexdigest()
        fh.close()
    response = conn.getresponse()
    headers = tuple_list_to_dict(response.getheaders())
    data = response.read()
    conn.close()
    # DELETE normally returns 204 No Content; everything else should be 200.
    if (response.status != 200 and method != "DELETE") or (response.status != 204 and method == "DELETE"):
        msg = 'Error response: ' + str(response.status) + "||reason: " + response.reason + "||data: " + data.decode(REMOTE_ENC)
        log(msg)
        #raise Exception(msg)
    if body_file is not None:
        # Compare our MD5 against the ETag S3 computed for the upload.
        if body_file.md5 != headers["ETag"][1:-1]:
            msg = "MD5 Differs for file: " + body_file.path
            print(msg, file=sys.stderr)
            raise Exception(msg)
    elif method == "GET":
        return xml_to_dict(data)

    return response.status

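# For reference, a signature-version-2 string-to-sign for a PUT as built
# above looks like this (all values hypothetical):
#   PUT\n
#   \n                                       (Content-MD5, unused here)
#   image/jpeg\n                             (Content-Type)
#   Tue, 27 Mar 2007 21:15:45 +0000\n        (Date)
#   x-amz-acl:private\n                      (sorted x-amz-* headers)
#   x-amz-storage-class:REDUCED_REDUNDANCY\n
#   /mybucket/backup/cat.jpg                 (resource path)
# The HMAC-SHA1 of this string, base64-encoded, becomes the Authorization
# header value after "AWS <access_key>:".
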
# ------------------------------------------------------------------------
# Misc Helpers
# ------------------------------------------------------------------------
def tuple_list_to_dict(tuple_list):
    out = {}
    for pair in tuple_list:
        out[pair[0]] = pair[1]
    return out

# From: http://www.joelverhagen.com/blog/2011/02/md5-hash-of-file-in-python/
def get_file_md5(path):
    fh = open(path, 'rb')
    m = hashlib.md5()
    while True:
        data = fh.read(8192)
        if not data:
            break
        m.update(data)
    fh.close()
    return m.hexdigest()

# ------------------------------------------------------------------------
# XML Helpers
# ------------------------------------------------------------------------
def strip_tag_braces_part(tag):
    # ElementTree prefixes tags with "{namespace}"; strip that part.
    index = tag.find("}")
    if index == -1:
        return tag
    else:
        return tag[index + 1:]

## From: http://code.activestate.com/recipes/573463-converting-xml-to-dictionary-and-back/
def xml_to_dict_recurse(node):
    nodedict = {}
    if len(node.items()) > 0:
        # if we have attributes, set them
        nodedict.update(dict(node.items()))
    for child in node:
        # recursively add the element's children
        newitem = xml_to_dict_recurse(child)
        tag = strip_tag_braces_part(child.tag)
        if tag in nodedict:
            # found duplicate tag, force a list
            if isinstance(nodedict[tag], list):
                # append to existing list
                nodedict[tag].append(newitem)
            else:
                # convert to list
                nodedict[tag] = [nodedict[tag], newitem]
        else:
            # only one, directly set the dictionary
            nodedict[tag] = newitem
    if node.text is None:
        text = ''
    else:
        text = node.text.strip()
    if len(nodedict) > 0:
        # if we have a dictionary, add the text as a value (if there is any)
        if len(text) > 0:
            nodedict['_text'] = text
    else:
        # if we don't have child nodes or attributes, just set the text
        nodedict = text
    return nodedict

def xml_to_dict(xml_str):
    root = ElementTree.fromstring(xml_str)
    out = {}
    tag = strip_tag_braces_part(root.tag)
    out[tag] = xml_to_dict_recurse(root)
    return out

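# A minimal sketch of how xml_to_dict flattens an S3-style listing; the XML
# below is hypothetical and trimmed for illustration.
def _example_xml_to_dict():
    sample = b"<ListBucketResult><Name>bucket</Name>" \
             b"<Contents><Key>a.txt</Key><Size>3</Size></Contents>" \
             b"</ListBucketResult>"
    result = xml_to_dict(sample)
    # Leaf values stay strings, e.g. result["ListBucketResult"]["Contents"]
    # here is {"Key": "a.txt", "Size": "3"}.
    return result
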
# ------------------------------------------------------------------------
# AES Password Helpers
# ------------------------------------------------------------------------
def get_key_from_pw(password):
    # The PRNG is seeded with a fixed value so the same password always
    # derives the same key.
    random.seed(9284)
    round = 0
    num_rounds = 10000
    rand_salt_len = 20
    initial_salt = "asjkenvien732xe;'/~124*asdfze".encode(REMOTE_ENC)
    key = hashlib.sha256(password.encode(REMOTE_ENC) + initial_salt).digest()
    char_set = string.ascii_uppercase + string.digits
    # Stretch the key by hashing repeatedly with deterministic salts.
    while round < num_rounds:
        rand_bytes = "".join(random.sample(char_set, rand_salt_len)).encode(REMOTE_ENC)
        key = hashlib.sha256(key + rand_bytes).digest()
        round += 1
    return key

# Taken from: http://www.codekoala.com/blog/2009/aes-encryption-python-using-pycrypto/
# Requires PyCrypto
def encrypt_decrypt_aes(should_encrypt, password, data):
    from Crypto.Cipher import AES
    data = data.encode(REMOTE_ENC)
    key = get_key_from_pw(password)
    PADDING = b'{'
    BLOCK_SIZE = 32
    # create a cipher object using the derived key (PyCrypto defaults to ECB mode)
    cipher = AES.new(key)
    if should_encrypt:
        # pad to a multiple of BLOCK_SIZE, encrypt, then base64-encode
        data += (BLOCK_SIZE - len(data) % BLOCK_SIZE) * PADDING
        data = cipher.encrypt(data)
        data = base64.b64encode(data)
    else:
        data = base64.b64decode(data)
        data = cipher.decrypt(data)
        data = data.rstrip(PADDING)

    return data.decode(REMOTE_ENC)

def encrypt_secret_key(password, secret_key):
    return encrypt_decrypt_aes(True, password, secret_key)

def decrypt_secret_key(password, secret_key):
    return encrypt_decrypt_aes(False, password, secret_key)

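# A minimal round-trip sketch (requires PyCrypto at call time; the password
# and key text here are hypothetical placeholders).
def _example_secret_key_round_trip():
    password = "correct horse"             # hypothetical password
    secret = "AAAAAAAAAAAAAAAAAAAA"        # hypothetical secret key
    encrypted = encrypt_secret_key(password, secret)
    assert decrypt_secret_key(password, encrypted) == secret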