# s3_helpers.py
# Amazon S3 helper functions
import sys
import http.client
import time
import hashlib
import string
import random
import re
import hmac
import mimetypes
import base64
import urllib.parse
import os
import traceback
from stat import ST_SIZE, ST_MTIME
from xml.etree import ElementTree

REMOTE_ENC = 'utf-8'
SEND_CHUNK_SIZE = 8192
LOG_FILE = ''
STORAGE_CLASS = 'REDUCED_REDUNDANCY'
class FileInfo:
    def __init__(self):
        self.base_path = ""
        self.path = ""
        self.md5 = ""
        self.size = 0
        self.mod_time = 0
        self.storage_class = ""
        self.placeholder_file = None

    def __str__(self):
        try:
            return "\"" + self.path + "\" [" + str(self.size) + " bytes]"
        except:
            return "(Error preparing file info string)"

    def get_full_path(self):
        return os.path.join(self.base_path, self.path)

    def calc_md5(self, base_dir):
        full_path = os.path.join(base_dir, self.path)
        try:
            log("Calculating MD5 for " + full_path)
            out = get_file_md5(full_path)
        except:
            log("Error calculating MD5 for: " + full_path + " " + str(sys.exc_info()[0]))
            out = ""
        return out
class FileInfoMap:
    def __init__(self):
        self.by_path = {}
        self.by_md5 = {}
        self.by_size = {}

    def add_file(self, info):
        self.by_path[info.path] = info
        if info.size > 0:
            if info.size in self.by_size:
                self.by_size[info.size].append(info)
            else:
                self.by_size[info.size] = [info]
        if info.md5 != "":
            self.by_md5[info.md5] = info
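
# A FileInfoMap indexes the same FileInfo objects three ways: by relative
# path (exact lookup), by size (buckets of candidates, so that only files of
# a matching size ever need hashing) and by MD5 (content identity, used to
# turn renames into server-side copies).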
outfile = None

def close_log_file():
    global outfile
    if LOG_FILE != "":
        outfile.close()

def log(msg):
    global outfile
    if LOG_FILE != "" and outfile is None:
        outfile = open(LOG_FILE, "w")
    try:
        print(msg)
        if LOG_FILE != "":
            outfile.write(msg + "\n")
    except:
        print("Error printing or writing log file line.")
def save_remote_map(remote_map, cache_file):
    paths = list(remote_map.by_path.keys())
    paths.sort()
    log("Saving remote map ...")
    fh = open(cache_file, 'w')
    dir_stack = []
    for path in paths:
        line = ""
        dir_parts = path.split("/")
        file_part = dir_parts[-1]
        dir_parts = dir_parts[0:-1]
        # One leading tab per directory component shared with the previous line.
        i = 0
        while i < len(dir_stack) and i < len(dir_parts) and dir_parts[i] == dir_stack[i]:
            line += "\t"
            i += 1
        # Drop the stack entries that no longer match.
        del dir_stack[i:]
        while i < len(dir_parts):
            line += dir_parts[i] + "/"
            dir_stack.append(dir_parts[i])
            i += 1
        info = remote_map.by_path[path]
        line += file_part + "\t" + str(info.size) + "\t" + info.md5 + "\t" + str(info.mod_time) + "\n"
        try:
            fh.write(line)
        except:
            log("Error writing line of remote map.")
    fh.close()
    log("Saved remote map.")
def load_remote_map(cache_file):
    log("Reading remote cache file...")
    fh = open(cache_file, "r")
    file_map = FileInfoMap()
    lines = fh.readlines()
    fh.close()
    log("Read remote cache file.")
    log("Creating remote map...")
    dir_stack = []
    indent = 0
    for line in lines:
        path = ""
        # Leading tabs reuse directory components from the previous line.
        i = 0
        while line[i] == '\t':
            path += dir_stack[i] + "/"
            i += 1
        line = line[i:]
        while indent > i:
            dir_stack.pop()
            indent -= 1
        record = line.split("\t")
        dir_parts = record[0].split("/")
        file_part = dir_parts[-1]
        dir_parts = dir_parts[:-1]
        for part in dir_parts:
            dir_stack.append(part)
            indent += 1
            path += part + "/"
        info = FileInfo()
        info.path = path + file_part
        info.size = int(record[1])
        info.md5 = record[2]
        info.mod_time = int(record[3])
        file_map.add_file(info)
    log("Created remote map")
    return file_map
def execute_operations(access_key, secret_key, local_dir, bucket, prefix, to_upload, to_copy, to_delete,
                       remote_map, storage_class, access_level):
    remote_dir_base = "/" + bucket + "/" + prefix
    # As each operation succeeds, the remote map is updated to match.
    total_ops = len(to_copy) + len(to_upload) + len(to_delete)
    op_num = 1
    for copy_op in to_copy:
        copy_local_dst = copy_op[0]
        copy_remote_src = copy_op[1]
        amz_headers = {}
        amz_headers['x-amz-storage-class'] = storage_class
        amz_headers['x-amz-acl'] = access_level
        amz_headers['x-amz-copy-source'] = remote_dir_base + copy_remote_src.path
        try:
            status = s3_operation(access_key, secret_key, "PUT", remote_dir_base + copy_local_dst.path, "", amz_headers)
            if status == 200:
                remote_map.add_file(copy_local_dst)
                log("Copied to: " + str(copy_local_dst) + " from " + str(copy_remote_src)
                    + " (" + str(op_num) + "/" + str(total_ops) + ")")
            else:
                log("Tried to copy, source not found, status: " + str(status))
        except:
            log("Error copying file: " + str(copy_local_dst) + " from " + str(copy_remote_src) + ": " + str(sys.exc_info()[0]))
        op_num += 1
    for local in to_upload:
        amz_headers = {}
        amz_headers['x-amz-storage-class'] = storage_class
        amz_headers['x-amz-acl'] = access_level
        try:
            s3_operation(access_key, secret_key, "PUT", remote_dir_base + local.path, "", amz_headers, local)
            remote_map.add_file(local)
            log("Uploaded: " + str(local) + " (" + str(op_num) + "/" + str(total_ops) + ")")
        except:
            log("Error uploading file: " + str(local) + ": " + str(sys.exc_info()[0]) + ": " + str(sys.exc_info()[1]))
            traceback.print_exc(file=sys.stdout)
        op_num += 1
    for remote_to_delete in to_delete:
        try:
            s3_operation(access_key, secret_key, "DELETE", remote_dir_base + remote_to_delete.path, "", {})
            # Only by_path needs updating here, because only it is used when saving the map.
            del remote_map.by_path[remote_to_delete.path]
            log("Deleted: " + str(remote_to_delete) + " (" + str(op_num) + "/" + str(total_ops) + ")")
        except:
            log("Error deleting file: " + str(remote_to_delete) + ": " + str(sys.exc_info()[0]))
        op_num += 1
def determine_operations(local_dir, local_map, remote_map):
    log("Comparing local and remote files...")
    to_upload = []
    to_copy = []
    to_delete = remote_map.by_path.copy()
    # Disabled: eliminate duplicates among local files, replacing them with
    # placeholders (via FileInfo.placeholder_file).
    #size_cutoff = 1024
    #log("Checking for duplicate local files...")
    #local_sizes = list(local_map.by_size.keys())
    #for local_size in local_sizes:
    #    if local_size > size_cutoff and len(local_map.by_size[local_size]) > 1:
    #        same_size_files = local_map.by_size[local_size]
    #        # Calc MD5s, reusing the remote value when size and mod time match
    #        for local in same_size_files:
    #            remote_by_path = remote_map.by_path.get(local.path)
    #            if remote_by_path.size == local.size and remote_by_path.mod_time == local.mod_time:
    #                local.md5 = remote_by_path.md5
    #            else:
    #                local.md5 = local.calc_md5(local_dir)
    #        # Find duplicates by MD5
    #        by_md5 = {}
    #        for local in same_size_files:
    #            if local.md5 in by_md5:
    #                matching_file = by_md5[local.md5]
    #                log("Duplicate local file: " + str(local) + " same as " + str(matching_file))
    #                local.placeholder_file = matching_file
    #            else:
    #                by_md5[local.md5] = local
    file_list = local_map.by_path.values()
    for local in file_list:
        try:
            str(local)
        except UnicodeEncodeError:
            print("Unable to convert filename to unicode: " + repr(local.path))
            continue
        remote_by_path = remote_map.by_path.get(local.path)
        if remote_by_path is not None:
            if local.size != remote_by_path.size:
                to_upload.append(local)
            elif local.mod_time != remote_by_path.mod_time or local.mod_time == 0 or remote_by_path.mod_time == 0:
                # Sizes match but mod times differ (or are unknown): fall back
                # to comparing MD5s before deciding to re-upload.
                if local.md5 == "":
                    local.md5 = local.calc_md5(local_dir)
                if local.md5 == remote_by_path.md5:
                    remote_by_path.mod_time = local.mod_time
                else:
                    to_upload.append(local)
            del to_delete[local.path]
        else:
            if local.size > 0:
                remote_by_size_bucket = remote_map.by_size.get(local.size)
                if remote_by_size_bucket is None:
                    to_upload.append(local)
                else:
                    # Not in the remote map by path, but a remote file of the
                    # same size exists; check for identical content to copy.
                    if local.md5 == "":
                        local.md5 = local.calc_md5(local_dir)
                    remote_by_md5 = remote_map.by_md5.get(local.md5)
                    if remote_by_md5 is None or remote_by_md5.size != local.size:
                        to_upload.append(local)
                    else:
                        to_copy.append([local, remote_by_md5])
            else:
                # Empty files are always uploaded directly.
                to_upload.append(local)
    to_delete = list(to_delete.values())
    for file in to_upload:
        log("About to upload: " + str(file))
    for dst_src in to_copy:
        log("About to copy to: " + str(dst_src[0]) + " from " + str(dst_src[1]))
    for file in to_delete:
        log("About to delete: " + str(file))
    log("Done comparing local and remote files.")
    return to_upload, to_copy, to_delete
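
# Decision rules applied above, per local file:
#   * Present remotely at the same path: re-upload when sizes differ; when
#     only mod times differ, compare MD5s and re-upload only on a mismatch.
#     Either way the path is dropped from the delete set.
#   * Absent remotely: upload, unless a remote file with the same size and
#     MD5 exists, in which case a server-side copy is scheduled instead.
#   * Whatever then remains of the copied remote map becomes the delete list.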
def get_local_file_map(local_dir, exclude):
    local_dir_len = len(local_dir)
    file_map = FileInfoMap()
    log("Walking directory tree...")
    log("Excluding: " + str(exclude))
    paths = []
    for root, dirs, files in os.walk(local_dir):
        for name in files:
            full_path = os.path.join(root, name)
            if re.match(exclude, full_path) is None:
                paths.append(full_path)
    log("Loaded local files")
    log("Getting local file attributes and building map...")
    for full_path in paths:
        try:
            stats = os.stat(full_path)
        except OSError:
            log("Error getting stats for: " + full_path + " " + str(sys.exc_info()[0]))
            continue
        info = FileInfo()
        info.base_path = local_dir
        info.path = (full_path[local_dir_len:]).replace("\\", "/")
        info.size = int(stats[ST_SIZE])
        info.mod_time = int(stats[ST_MTIME])
        file_map.add_file(info)
    log("Built map of local files.")
    return file_map
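
# Example call (directory and pattern are hypothetical). Note the trailing
# slash: relative paths are produced by slicing len(local_dir) characters off
# the front of each walked path.
#
#   local_map = get_local_file_map("/home/me/photos/", r".*\.tmp$")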
def get_remote_file_map(access_key, secret_key, bucket, prefix):
    file_map = FileInfoMap()
    marker = ""
    prefix_len = len(prefix)
    # List the bucket page by page: keep requesting with a marker until the
    # response is no longer truncated.
    while True:
        url = "/" + bucket + "/"
        if marker != "" or prefix != "":
            query_str = "?"
        else:
            query_str = ""
        if marker != "":
            query_str += "marker=" + urllib.parse.quote(marker)
            if prefix != "":
                query_str += "&"
        if prefix != "":
            query_str += "prefix=" + urllib.parse.quote(prefix)
        result_dict = s3_operation(access_key, secret_key, "GET", url, query_str, {}, None)
        try:
            result_dict = result_dict['ListBucketResult']
        except:
            # In this case the bucket may not exist; just return an empty map.
            return file_map
        if 'Contents' in result_dict:
            contents = result_dict['Contents']
            # If there is just one item, it needs wrapping in a list for the for loop.
            if isinstance(contents, dict):
                contents = [contents]
            for file in contents:
                path_in_bucket = file['Key']
                if path_in_bucket.endswith("/"):
                    # Skip the directory placeholder entries.
                    continue
                info = FileInfo()
                info.base_path = url + prefix
                info.path = path_in_bucket[prefix_len:]
                info.storage_class = file['StorageClass']
                info.md5 = file['ETag'][1:-1]
                info.size = int(file['Size'])
                file_map.add_file(info)
        else:
            contents = []
        if result_dict['IsTruncated'] != 'true':
            break
        marker = contents[-1]['Key']
        log("Loaded remote to: " + marker)
    return file_map
def s3_operation(access_key, secret_key, method, path, query_str="", amz_headers=None, body_file=None):
    if amz_headers is None:
        amz_headers = {}
    server = 's3.amazonaws.com'
    conn = http.client.HTTPConnection(server)
    # Perhaps this could be taken from body_file.md5 when it is specified.
    content_md5 = ""
    content_type = mimetypes.guess_type(path)[0]
    if content_type is None:
        content_type = ""
    resource_str = urllib.parse.quote(path)
    date_str = time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())
    # Canonicalize the x-amz-* headers: sorted, "name:value", one per line.
    amz_headers_str = ""
    sorted_keys = list(amz_headers.keys())
    sorted_keys.sort()
    for key in sorted_keys:
        if key == 'x-amz-copy-source':
            amz_headers[key] = urllib.parse.quote(amz_headers[key])
        amz_headers_str += key + ":" + amz_headers[key] + "\n"
    # Signature Version 2, from: http://mashupguide.net/1.0/html/ch16s05.xhtml
    string_to_sign = method + "\n" + content_md5 + "\n" + content_type + "\n" + date_str + "\n" + amz_headers_str + resource_str
    signature = base64.b64encode(hmac.new(secret_key.encode(REMOTE_ENC), string_to_sign.encode(REMOTE_ENC), hashlib.sha1).digest())
    headers = amz_headers
    headers['Date'] = date_str
    # Query the size again here so that it is up to date in case it changed.
    if body_file is not None:
        body_file.size = os.stat(body_file.get_full_path())[ST_SIZE]
        headers['Content-Length'] = body_file.size
    headers['Content-Type'] = content_type
    headers['Authorization'] = "AWS " + access_key + ":" + signature.decode(REMOTE_ENC)
    conn.connect()
    url = resource_str + query_str
    if method == "PUT":
        # Send the request line and headers manually so the body can be
        # streamed afterwards (taken from S3.py).
        conn.putrequest(method, url)
        for header in headers.keys():
            conn.putheader(header, headers[header])
        conn.endheaders()
    else:
        conn.request(method, url, "", headers)
    if body_file is not None:
        # Stream the file in chunks, computing its MD5 along the way.
        fh = open(body_file.get_full_path(), 'rb')
        md5_hash = hashlib.md5()
        size_left = body_file.size
        while size_left > 0:
            data = fh.read(SEND_CHUNK_SIZE)
            md5_hash.update(data)
            conn.send(data)
            size_left -= len(data)
        body_file.md5 = md5_hash.hexdigest()
        fh.close()
    response = conn.getresponse()
    response_headers = tuple_list_to_dict(response.getheaders())
    data = response.read()
    conn.close()
    # 200 is the expected success status, except for DELETE, which succeeds
    # with 204 No Content.
    if (method != "DELETE" and response.status != 200) or (method == "DELETE" and response.status != 204):
        msg = 'Error response: ' + str(response.status) + "||reason: " + response.reason + "||data: " + data.decode(REMOTE_ENC)
        log(msg)
        #raise Exception(msg)
    if body_file is not None:
        # Verify the upload by comparing the local MD5 with the ETag S3 returns.
        if body_file.md5 != response_headers["ETag"][1:-1]:
            msg = "MD5 differs for file: " + body_file.path
            log(msg)
            raise Exception(msg)
    elif method == "GET":
        return xml_to_dict(data)
    return response.status
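
# Illustrative string-to-sign for a PUT of "/mybucket/photos/a.jpg" carrying
# one x-amz header (all values hypothetical):
#
#   PUT\n
#   \n                                    <- empty Content-MD5
#   image/jpeg\n
#   Tue, 27 Mar 2012 19:36:42 +0000\n
#   x-amz-storage-class:REDUCED_REDUNDANCY\n
#   /mybucket/photos/a.jpg
#
# The base64-encoded HMAC-SHA1 of this string under the secret key is sent as
# "Authorization: AWS <access_key>:<signature>".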
# ------------------------------------------------------------------------
# Misc Helpers
# ------------------------------------------------------------------------
def tuple_list_to_dict(tuple_list):
    out = {}
    for key, value in tuple_list:
        out[key] = value
    return out
# From: http://www.joelverhagen.com/blog/2011/02/md5-hash-of-file-in-python/
def get_file_md5(path):
    fh = open(path, 'rb')
    m = hashlib.md5()
    while True:
        data = fh.read(8192)
        if not data:
            break
        m.update(data)
    fh.close()
    return m.hexdigest()
# ------------------------------------------------------------------------
# XML Helpers
# ------------------------------------------------------------------------
def strip_tag_braces_part(tag):
    # ElementTree reports namespaced tags as "{namespace}Tag"; keep just "Tag".
    index = tag.find("}")
    if index == -1:
        return tag
    else:
        return tag[index + 1:]

## From: http://code.activestate.com/recipes/573463-converting-xml-to-dictionary-and-back/
def xml_to_dict_recurse(node):
    nodedict = {}
    if len(node.items()) > 0:
        # if we have attributes, set them
        nodedict.update(dict(node.items()))
    for child in node:
        # recursively add the element's children
        newitem = xml_to_dict_recurse(child)
        tag = strip_tag_braces_part(child.tag)
        if tag in nodedict:
            # found a duplicate tag, force a list
            if isinstance(nodedict[tag], list):
                # append to the existing list
                nodedict[tag].append(newitem)
            else:
                # convert to a list
                nodedict[tag] = [nodedict[tag], newitem]
        else:
            # only one, directly set the dictionary
            nodedict[tag] = newitem
    if node.text is None:
        text = ''
    else:
        text = node.text.strip()
    if len(nodedict) > 0:
        # if we have a dictionary, add the text as a dictionary value (if there is any)
        if len(text) > 0:
            nodedict['_text'] = text
    else:
        # if there are no child nodes or attributes, just return the text
        nodedict = text
    return nodedict

def xml_to_dict(xml_str):
    root = ElementTree.fromstring(xml_str)
    tag = strip_tag_braces_part(root.tag)
    return {tag: xml_to_dict_recurse(root)}
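
# For example (hypothetical document), the XML
#   <Thing><Name>a</Name><Name>b</Name><Size>3</Size></Thing>
# becomes
#   {'Thing': {'Name': ['a', 'b'], 'Size': '3'}}
# Repeated tags are collected into a list, which is why the ListBucketResult
# handling above checks whether 'Contents' is a dict or a list.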
# ------------------------------------------------------------------------
# AES Password Helpers
# ------------------------------------------------------------------------
def get_key_from_pw(password):
    # Deliberately fixed seed: the same password must always derive the same key.
    random.seed(9284)
    num_rounds = 10000
    rand_salt_len = 20
    initial_salt = "asjkenvien732xe;'/~124*asdfze".encode(REMOTE_ENC)
    key = hashlib.sha256(password.encode(REMOTE_ENC) + initial_salt).digest()
    char_set = string.ascii_uppercase + string.digits
    for _ in range(num_rounds):
        rand_bytes = "".join(random.sample(char_set, rand_salt_len)).encode(REMOTE_ENC)
        key = hashlib.sha256(key + rand_bytes).digest()
    return key

# Taken from: http://www.codekoala.com/blog/2009/aes-encryption-python-using-pycrypto/
# Requires PyCrypto
def encrypt_decrypt_aes(should_encrypt, password, data):
    from Crypto.Cipher import AES
    data = data.encode(REMOTE_ENC)
    key = get_key_from_pw(password)
    PADDING = b'{'
    BLOCK_SIZE = 32
    # Create a cipher object using the derived key (ECB, PyCrypto's default mode).
    cipher = AES.new(key, AES.MODE_ECB)
    if should_encrypt:
        # Pad to a multiple of BLOCK_SIZE, encrypt, then base64-encode.
        data += (BLOCK_SIZE - len(data) % BLOCK_SIZE) * PADDING
        data = cipher.encrypt(data)
        data = base64.b64encode(data)
    else:
        data = base64.b64decode(data)
        data = cipher.decrypt(data)
        data = data.rstrip(PADDING)
    return data.decode(REMOTE_ENC)

def encrypt_secret_key(password, secret_key):
    return encrypt_decrypt_aes(True, password, secret_key)

def decrypt_secret_key(password, secret_key):
    return encrypt_decrypt_aes(False, password, secret_key)
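
# A minimal end-to-end sketch of how these helpers fit together (bucket name,
# paths and credentials below are hypothetical; real use needs valid AWS keys):
#
#   local_map = get_local_file_map("/home/me/photos/", r".*\.tmp$")
#   remote_map = get_remote_file_map(access_key, secret_key, "mybucket", "photos/")
#   to_upload, to_copy, to_delete = determine_operations("/home/me/photos/", local_map, remote_map)
#   execute_operations(access_key, secret_key, "/home/me/photos/", "mybucket", "photos/",
#                      to_upload, to_copy, to_delete, remote_map, STORAGE_CLASS, "private")
#   save_remote_map(remote_map, "/home/me/.s3cache")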