methods.py 43 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831
  1. #!/usr/bin/env python
  2. from azure.storage.blob import BlobSasPermissions, BlobServiceClient, generate_blob_sas, RetentionPolicy
  3. from argparse import ArgumentParser
  4. from io import StringIO
  5. import pandas as pd
  6. import numpy as np
  7. import coloredlogs
  8. import datetime
  9. import logging
  10. import getpass
  11. import keyring
  12. import pathlib
  13. import pandas
  14. import azure
  15. import time
  16. import os
  17. import re
  18. def create_parent_parser(parser, container=True):
  19. """
  20. Create a parent parser with arguments common to multiple scripts
  21. :param parser: type ArgumentParser object
  22. :param container: type bool: Boolean on whether the container argument should be added to the parser
  23. :return: subparsers: type ArgumentParser.add_subparsers
  24. :return: parent_parser: type ArgumentParser: Populated ArgumentParser object
  25. """
  26. subparsers = parser.add_subparsers(title='Available functionality')
  27. # Create a parental parser that can be inherited by subparsers
  28. parent_parser = ArgumentParser(add_help=False)
  29. if container:
  30. parent_parser.add_argument('-c', '--container_name',
  31. required=True,
  32. type=str,
  33. default=str(),
  34. help='Name of the Azure storage container. Note that container names must be '
  35. 'lowercase, between 3 and 63 characters, start with a letter or number, and '
  36. 'can contain only letters, numbers, and the dash (-) character. Consecutive '
  37. 'dashes are not permitted.')
  38. parent_parser.add_argument('-a', '--account_name',
  39. required=True,
  40. type=str,
  41. help='Name of the Azure storage account')
  42. parent_parser.add_argument('-p', '--passphrase',
  43. default='AzureStorage',
  44. type=str,
  45. help='The passphrase to use when encrypting the azure storage-specific connection '
  46. 'string to the system keyring. Default is "AzureStorage".')
  47. parent_parser.add_argument('-v', '--verbosity',
  48. choices=['debug', 'info', 'warning', 'error', 'critical'],
  49. metavar='VERBOSITY',
  50. default='info',
  51. help='Set the logging level. Options are debug, info, warning, error, and critical. '
  52. 'Default is info.')
  53. return subparsers, parent_parser
  54. def setup_logging(arguments):
  55. """
  56. Set the custom colour scheme and message format to used by coloredlogs
  57. :param arguments: type parsed ArgumentParser object
  58. """
  59. # Set up logging
  60. coloredlogs.DEFAULT_LEVEL_STYLES = {'debug': {'bold': True, 'color': 'green'},
  61. 'info': {'bold': True, 'color': 'blue'},
  62. 'warning': {'bold': True, 'color': 'yellow'},
  63. 'error': {'bold': True, 'color': 'red'},
  64. 'critical': {'bold': True, 'background': 'red'}
  65. }
  66. coloredlogs.DEFAULT_LOG_FORMAT = '%(asctime)s %(levelname)s %(message)s'
  67. coloredlogs.install(level=arguments.verbosity.upper())
  68. def setup_arguments(parser):
  69. """
  70. Finalise setting up the ArgumentParser arguments into an object, and running subparser functions, or displaying the
  71. help message
  72. :param parser: type: ArgumentParser object
  73. :return: parsed ArgumentParser object
  74. """
  75. # Get the arguments into an object
  76. arguments = parser.parse_args()
  77. # Run the appropriate function for each sub-parser.
  78. if hasattr(arguments, 'func'):
  79. # Set up logging
  80. setup_logging(arguments=arguments)
  81. arguments.func(arguments)
  82. # If the 'func' attribute doesn't exist, display the basic help for the appropriate subparser (if any)
  83. else:
  84. try:
  85. # Determine which subparser was called by extracting it from the arguments. Note that this requires the
  86. # use of the desc keyword when creating subparsers
  87. command = list(vars(arguments).keys())[0]
  88. # If the extracted command exists, use the command-specific subparser help
  89. if command:
  90. parser.parse_args([command, '-h'])
  91. # Otherwise, use the basic help
  92. else:
  93. parser.parse_args(['-h'])
  94. # If the were no subparsers specified (the list of keys in the arguments is empty), use the basic help
  95. except IndexError:
  96. parser.parse_args(['-h'])
  97. return arguments
  98. def set_account_name(passphrase, account_name=None):
  99. """
  100. Store the account name in the system keyring
  101. :param passphrase: type str: Simple passphrase to use to store the connection string in the system keyring
  102. :param account_name: type str: Name of the Azure storage account
  103. """
  104. # Only prompt the user for the account name if it was not provided
  105. if account_name is None:
  106. # Prompt the user for the account name
  107. account_name = input('Please enter your account name\n').encode('utf-8').decode()
  108. # Set the account name into the keyring. Treat it as a password, and use the passphrase as both the service ID,
  109. # and the username
  110. keyring.set_password(passphrase, passphrase, account_name)
  111. return account_name
  112. def set_connection_string(passphrase, account_name):
  113. """
  114. Prompt the user for the connection string, and store it the system keyring
  115. Uses logic from https://stackoverflow.com/a/31882203
  116. :param passphrase: type str: Simple passphrase to use to store the connection string in the system keyring
  117. :param account_name: type str: Name of the Azure storage account
  118. :return: connect_str: String of the connection string
  119. """
  120. # Prompt the user for the connection string. Use decode to convert from bytes. Use getpass, so the plain text
  121. # password isn't printed to the screen
  122. connect_str = getpass.getpass(prompt='Please enter the connection string for your Azure storage account:\n')\
  123. .encode('utf-8').decode()
  124. # Ensure that the account name provided and the account name specified in the connection string match
  125. confirm_account_match(account_name=account_name,
  126. connect_str=connect_str)
  127. # Set the password in the keyring. Use the passphrase as the service ID, the account name as the username,
  128. # and the connection string as the password
  129. keyring.set_password(passphrase, account_name, connect_str)
  130. logging.info('Successfully entered credentials into keyring')
  131. return connect_str
  132. def confirm_account_match(account_name, connect_str):
  133. """
  134. Ensure that the account name provided matches the account name stored in the connection string
  135. :param connect_str: type str: Connection string for the Azure storage account
  136. :param account_name: type str: Name of the Azure storage account
  137. """
  138. # Attempt to extract the account name from the connection string
  139. try:
  140. connect_str_account_name = connect_str.split(';')[1].split('AccountName=')[-1]
  141. # Ensure that the account name provided matches the account name found in the connection string
  142. if account_name != connect_str_account_name:
  143. logging.error(f'The supplied account name {account_name} does not match the account name stored in the '
  144. f'connection string ({connect_str_account_name}). Please ensure that you are providing the '
  145. f'appropriate connection string for your account.')
  146. raise SystemExit
  147. # If splitting on 'AccountName=' fails, the connection string is either malformed or invalid
  148. except IndexError:
  149. logging.error('Could not parse the account key from the connection string in the keyring. Please ensure that '
  150. 'it has been entered, and the it conforms to the proper format: '
  151. 'DefaultEndpointsProtocol=https;AccountName=[REDACTED];AccountKey=[REDACTED];'
  152. 'EndpointSuffix=core.windows.net')
  153. raise SystemExit
  154. return True
  155. def extract_connection_string(passphrase, account_name):
  156. """
  157. Extract the connection string from the keyring using the account name and passphrase
  158. :param passphrase: type str: Simple passphrase to use to store the connection string in the system keyring
  159. :param account_name: type str: Name of the Azure storage account
  160. :return: connect_str: String of the connection string
  161. """
  162. # Use the passphrase and the account name to extract the connection string from the keyring
  163. connect_str = keyring.get_password(passphrase,
  164. account_name)
  165. # If the connection string can't be found in the keyring using the supplied passphrase, prompt the user for
  166. # the passphrase, and store it
  167. if not connect_str:
  168. logging.warning(f'Connection string linked to the provided passphrase {passphrase} and account name '
  169. f'{account_name} was not found in the system keyring. You will now be prompted to enter it.')
  170. connect_str = set_connection_string(passphrase=passphrase,
  171. account_name=account_name)
  172. # Confirm that the account name provided matches the one found in the connection string
  173. confirm_account_match(account_name=account_name,
  174. connect_str=connect_str)
  175. return connect_str
  176. def extract_account_name(passphrase):
  177. """
  178. Extract the account name from the system keyring
  179. :param passphrase: type str: Simple passphrase to use to store the connection string in the system keyring
  180. :return: account_name: Name of the Azure storage account
  181. """
  182. # Use the passphrase to extract the account name
  183. account_name = keyring.get_password(passphrase,
  184. passphrase)
  185. # If the account name hasn't been entered into the keyring, prompt the user to enter it
  186. if not account_name:
  187. logging.warning(f'Account name linked to the provided passphrase {passphrase} was not found in the system '
  188. f'keyring. You will now be prompted to enter it.')
  189. account_name = set_account_name(passphrase=passphrase)
  190. return account_name
  191. def extract_account_key(connect_str):
  192. """
  193. Extract the account key from the connection string. This is necessary for the method that creates the blob SAS,
  194. as it doesn't accept connection strings
  195. :param connect_str: type str: Connection string for the Azure storage account
  196. :return account_key: String of the account key extracted from the connection string
  197. """
  198. # Split the connection string on ';', use the entry corresponding to the account key, and strip off the
  199. # 'AccountKey='
  200. # DefaultEndpointsProtocol=https;AccountName=[REDACTED];AccountKey=[REDACTED];EndpointSuffix=core.windows.net
  201. try:
  202. account_key = connect_str.split(';')[2].split('AccountKey=')[-1]
  203. except IndexError:
  204. logging.error('Could not parse the account key from the connection string in the keyring. Please ensure that '
  205. 'it has been entered, and the it conforms to the proper format: '
  206. 'DefaultEndpointsProtocol=https;AccountName=[REDACTED];AccountKey=[REDACTED];'
  207. 'EndpointSuffix=core.windows.net')
  208. raise SystemExit
  209. return account_key
  210. def delete_keyring_credentials(passphrase, account_name=None):
  211. """
  212. Delete the password associated with the passphrase and account name from the system keyring
  213. :param passphrase: type str: Simple passphrase to use to store the connection string in the system keyring
  214. :param account_name: type str: Name of the Azure storage account
  215. :return: account_name: Name of the Azure storage account
  216. """
  217. if not account_name:
  218. # Prompt the user for the account name
  219. account_name = input('Please enter your account name\n').encode('utf-8').decode()
  220. try:
  221. # Delete the password from the system keyring
  222. keyring.delete_password(passphrase, account_name)
  223. except keyring.errors.PasswordDeleteError:
  224. logging.error(f'Connection string associated with passphrase {passphrase} and account name {account_name} '
  225. f'not found in system keyring. Please ensure that you supplied the correct arguments.')
  226. raise SystemExit
  227. return account_name
  228. def validate_container_name(container_name, object_type='container'):
  229. """
  230. Use a regex to check if the supplied name follows the guidelines for Azure nomenclature. If it doesn't, attempt to
  231. rename the container/object
  232. :param container_name: type str: Name of the container/object of interest
  233. :param object_type: type str: Name of the object being validated. Default is container, but target container, and
  234. target path are other options
  235. :return: container_name: String of sanitised container name
  236. """
  237. if not re.match('^[a-z0-9](?!.*--)[a-z0-9-]{1,61}[a-z0-9]$', container_name):
  238. logging.warning(f'{object_type.capitalize()} name, {container_name} is invalid. {object_type.capitalize()} '
  239. f'names must be between 3 and 63 characters, start with a letter or number, and can contain '
  240. f'only letters, numbers, and the dash (-) character. Every dash (-) character must be '
  241. f'immediately preceded and followed by a letter or number; consecutive dashes are not '
  242. f'permitted in {object_type} names. All letters in a {object_type} name must be lowercase.')
  243. logging.info(f'Attempting to fix the {object_type} name')
  244. # Swap out dashes for underscores, as they will be removed in the following regex
  245. container_name = container_name.replace('-', '_')
  246. # Use re to remove all non-word characters (including dashes)
  247. container_name = re.sub(r'[^\w]', '', container_name)
  248. # Replace multiple underscores with a single one. Uses logic from: https://stackoverflow.com/a/46701355
  249. # Also ensure that the container name is in lowercase
  250. container_name = re.sub(r'[^\w\s]|(_)(?=\1)', '', container_name).lower()
  251. # Swap out underscores for dashes
  252. container_name = container_name.replace('_', '-')
  253. # Ensure that the container name doesn't start or end with a dash
  254. container_name = re.sub(r'^-+', '', container_name)
  255. container_name = re.sub(r'-+$', '', container_name)
  256. # Ensure that the container name isn't length zero, or the while loop below will be infinite
  257. if len(container_name) == 0:
  258. logging.error(f'Attempting to fix the {object_type} name left zero valid characters! '
  259. 'Please enter a new name.')
  260. raise SystemExit
  261. # If the container name is too long, slice it to be 63 characters
  262. if len(container_name) >= 63:
  263. logging.warning(f'{object_type.capitalize()} name {container_name} was too long. Using {container_name[:62]} '
  264. f'instead')
  265. container_name = container_name[:62]
  266. # If the container name is too short, keep adding the container name to itself to bump up the length
  267. while len(container_name) < 3:
  268. logging.warning(f'{object_type.capitalize()} name {container_name} was too short (only {len(container_name)} '
  269. f'characters). Using {container_name + container_name} instead')
  270. container_name = container_name + container_name
  271. # Use the validated container name
  272. logging.info(f'Using {container_name} as the {object_type} name')
  273. return container_name
  274. def create_blob_service_client(connect_str):
  275. """
  276. Create a blob service client using the connection string
  277. :param connect_str: type str: Connection string for Azure storage
  278. :return: blob_service_client: type azure.storage.blob.BlobServiceClient
  279. """
  280. try:
  281. blob_service_client = BlobServiceClient.from_connection_string(connect_str)
  282. return blob_service_client
  283. except ValueError:
  284. logging.error('Your connection string was rejected. Please ensure that you entered it properly, and that it '
  285. 'is valid')
  286. raise SystemExit
  287. def create_container(blob_service_client, container_name):
  288. """
  289. Create a new container and container-specific client from the blob service client
  290. :param blob_service_client: type: type azure.storage.blob.BlobServiceClient
  291. :param container_name: type str: Name of the container of interest
  292. :return: container_client: type azure.storage.blob.BlobServiceClient.ContainerClient
  293. """
  294. # Hide the INFO-level messages sent to the logger from Azure by increasing the logging level to WARNING
  295. logging.getLogger().setLevel(logging.WARNING)
  296. try:
  297. container_client = blob_service_client.create_container(container_name)
  298. except azure.core.exceptions.ResourceExistsError as e:
  299. if 'The specified container already exists.' in str(e):
  300. container_client = create_container_client(blob_service_client=blob_service_client,
  301. container_name=container_name)
  302. elif 'The specified container is being deleted. Try operation later.' in str(e):
  303. logging.error(f'Could not create the requested container {container_name}. As it has recently been '
  304. f'deleted, please try again in a few moments')
  305. raise SystemExit
  306. else:
  307. logging.error(f'Could not create container {container_name}')
  308. raise SystemExit
  309. return container_client
  310. def create_container_client(blob_service_client, container_name, create=True):
  311. """
  312. Create a container-specific client from the blob service client
  313. :param blob_service_client: type: azure.storage.blob.BlobServiceClient
  314. :param container_name: type str: Name of the container of interest
  315. :param create: type bool: Boolean whether to create a container if it doesn't exist
  316. :return: container_client: type azure.storage.blob.BlobServiceClient.ContainerClient
  317. """
  318. # Create the container client from the blob service client with the get container client method
  319. # and the container name
  320. container_client = blob_service_client.get_container_client(container_name)
  321. # Create the container if it does not exist
  322. if not container_client.exists() and create:
  323. container_client = create_container(blob_service_client=blob_service_client,
  324. container_name=container_name)
  325. return container_client
  326. def create_blob_client(blob_service_client, container_name, blob_file):
  327. """
  328. Create a blob-specific client
  329. :param blob_service_client: type: azure.storage.blob.BlobServiceClient
  330. :param container_name: type str: Name of the container of interest
  331. :param blob_file: type iterable from azure.storage.blob.BlobServiceClient.ContainerClient.list_blobs
  332. :return: blob_client: type azure.storage.blob.BlobServiceClient.BlobClient
  333. """
  334. # Create a blob client for the current blob
  335. blob_client = blob_service_client.get_blob_client(container=container_name,
  336. blob=blob_file)
  337. return blob_client
  338. def create_blob_sas(blob_file, account_name, container_name, account_key, expiry, sas_urls):
  339. """
  340. Create SAS URL for blob
  341. :param blob_file: type container_client.list_blobs() object
  342. :param account_name: type str: Name of Azure storage account
  343. :param container_name: type str: Name of container in Azure storage in which the file is located
  344. :param account_key: type str: Account key of Azure storage account
  345. :param expiry: type int: Number of days that the SAS URL will be valid
  346. :param sas_urls: type dict: Dictionary of file name: SAS URL (empty)
  347. :return: populated sas_urls
  348. """
  349. # Set the name of file by removing any path information
  350. file_name = os.path.basename(blob_file.name)
  351. # Create the blob SAS. Use a start time 15 minutes in the past, and the requested expiry
  352. sas_token = generate_blob_sas(
  353. account_name=account_name,
  354. container_name=container_name,
  355. blob_name=blob_file.name,
  356. account_key=account_key,
  357. permission=BlobSasPermissions(read=True),
  358. start=datetime.datetime.utcnow() - datetime.timedelta(minutes=15),
  359. expiry=datetime.datetime.utcnow() + datetime.timedelta(days=expiry))
  360. # Create the SAS URL, and add it to the dictionary with the file_name as the key
  361. sas_urls[file_name] = create_sas_url(account_name=account_name,
  362. container_name=container_name,
  363. blob_name=blob_file.name,
  364. sas_token=sas_token)
  365. return sas_urls
  366. def client_prep(container_name, passphrase, account_name, create=True):
  367. """
  368. Validate the container name, and prepare the necessary clients
  369. :param container_name: type str: Name of the container of interest
  370. :param passphrase: type str: Simple passphrase to use to store the connection string in the system keyring
  371. :param account_name: type str: Name of the Azure storage account
  372. :param create: type bool: Boolean whether to create a container if it doesn't exist
  373. :return: container_name: Validated container name
  374. :return: connect_str: String of the connection string
  375. :return: blob_service_client: azure.storage.blob.BlobServiceClient
  376. :return: container_client: azure.storage.blob.BlobServiceClient.ContainerClient
  377. """
  378. # Validate the container name
  379. container_name = validate_container_name(container_name=container_name)
  380. # Extract the connection string from the system keyring
  381. connect_str = extract_connection_string(passphrase=passphrase,
  382. account_name=account_name)
  383. # Create the blob service client using the connection string
  384. blob_service_client = create_blob_service_client(connect_str=connect_str)
  385. # Create the container client for the desired container with the blob service client
  386. container_client = create_container_client(blob_service_client=blob_service_client,
  387. container_name=container_name,
  388. create=create)
  389. return container_name, connect_str, blob_service_client, container_client
  390. def sas_prep(container_name, passphrase, account_name, create=True):
  391. """
  392. Validate container names, extract connection strings, and account keys, and create necessary clients for
  393. SAS URL creation
  394. :param container_name: type str: Name of the container of interest
  395. :param passphrase: type str: Simple passphrase to use to store the connection string in the system keyring
  396. :param account_name: type str: Name of the Azure storage account
  397. :param create: type bool: Boolean whether to create a container if it doesn't exist
  398. :return: container_name: Validated container name
  399. :return: connect_str: Connection string for Azure storage
  400. :return: account_key: Account key for Azure storage
  401. :return: blob_service_client: azure.storage.blob.BlobServiceClient
  402. :return: container_client: azure.storage.blob.BlobServiceClient.ContainerClient
  403. """
  404. # Validate the container name
  405. container_name = validate_container_name(container_name=container_name)
  406. # Retrieve the connection string from the system keyring
  407. connect_str = extract_connection_string(passphrase=passphrase,
  408. account_name=account_name)
  409. # Extract the account key from the connection string
  410. account_key = extract_account_key(connect_str=connect_str)
  411. # Create the blob service client
  412. blob_service_client = create_blob_service_client(connect_str=connect_str)
  413. # Create the container client from the blob service client
  414. container_client = create_container_client(blob_service_client=blob_service_client,
  415. container_name=container_name,
  416. create=create)
  417. return container_name, connect_str, account_key, blob_service_client, container_client
  418. def create_sas_url(account_name, container_name, blob_name, sas_token):
  419. """
  420. Create the SAS URL from the required components
  421. :param account_name: type str: Name of Azure storage account
  422. :param container_name: type str: Name of the container of interest
  423. :param blob_name: type str: Name and path of the file of interest
  424. :param sas_token: type azure.storage.blob.generate_blob_sas
  425. :return: sas_url: String of the SAS URL
  426. """
  427. # Generate the SAS URL using the account name, the domain, the container name, the blob name, and the
  428. # SAS token in the following format:
  429. # 'https://' + account_name + '.blob.core.windows.net/' + container_name + '/' + blob_name + '?' + blob
  430. sas_url = f'https://{account_name}.blob.core.windows.net/{container_name}/{blob_name}?{sas_token}'
  431. return sas_url
  432. def write_sas(output_file, sas_urls):
  433. """
  434. Write the SAS URLs to the output file
  435. :param output_file: type str: Name and path of the file in which the SAS URLs are to be written
  436. :param sas_urls: type dict: Dictionary of file name: SAS URL
  437. """
  438. # Create the output file
  439. with open(output_file, 'w') as output:
  440. for file_name, sas_url in sas_urls.items():
  441. # Write the SAS URL to the output file
  442. output.write(f'{sas_url}\n')
  443. # Print the file name and SAS URL to the terminal
  444. logging.info(f'{file_name}\t{sas_url}')
  445. def set_blob_retention_policy(blob_service_client, days=8):
  446. """
  447. Set the retention policy for a blob
  448. :param blob_service_client: type: azure.storage.blob.BlobServiceClient
  449. :param days: type int: Number of days to retain deleted blobs. Default is 8
  450. :return: blob_service_client: Client with the retention policy implemented
  451. """
  452. # Create a retention policy to retain deleted blobs
  453. delete_retention_policy = RetentionPolicy(enabled=True, days=days)
  454. # Set the retention policy on the service
  455. blob_service_client.set_service_properties(delete_retention_policy=delete_retention_policy)
  456. return blob_service_client
  457. def move_prep(passphrase, account_name, container_name, target_container):
  458. """
  459. Prepare all the necessary clients for moving container/files/folders in Azure storage
  460. :param passphrase: type str: Simple passphrase to use to store the connection string in the system keyring
  461. :param account_name: type str: Name of Azure storage account
  462. :param container_name: type str: Name of the container of interest
  463. :param target_container: type str: Name of the new container into which the container/file/folder is to be copied
  464. :return: blob_service_client: type azure.storage.blob.BlobServiceClient
  465. :return: source_container_client: type azure.storage.blob.BlobServiceClient.ContainerClient for source container
  466. :return: target_container_client: type azure.storage.blob.BlobServiceClient.ContainerClient for target container
  467. """
  468. # Validate the container names
  469. container_name = validate_container_name(container_name=container_name)
  470. target_container = validate_container_name(container_name=target_container,
  471. object_type='target container')
  472. # Extract the connection string from the system keyring
  473. connect_str = extract_connection_string(passphrase=passphrase,
  474. account_name=account_name)
  475. blob_service_client = create_blob_service_client(connect_str=connect_str)
  476. source_container_client = create_container_client(blob_service_client=blob_service_client,
  477. container_name=container_name)
  478. # Hide the INFO-level messages sent to the logger from Azure by increasing the logging level to WARNING
  479. logging.getLogger().setLevel(logging.WARNING)
  480. target_container_client = create_container(blob_service_client=blob_service_client,
  481. container_name=target_container)
  482. return container_name, target_container, blob_service_client, source_container_client, target_container_client
  483. def copy_blob(blob_file, blob_service_client, container_name, target_container, path, storage_tier,
  484. object_name=None, category=None, common_path=None):
  485. """
  486. Copy a blob from one container to another
  487. :param blob_file: type iterable from azure.storage.blob.BlobServiceClient.ContainerClient.list_blobs
  488. :param container_name: type str: Name of the container in which the file is located
  489. :param blob_service_client: type: azure.storage.blob.BlobServiceClient
  490. :param target_container: type str: Name of the new container into which the file is to be copied
  491. :param path: type str: Path of folders in which the files are to be placed
  492. :param storage_tier: type str: Storage tier to use for the copied file/folder
  493. :param object_name: type str: Name and path of file/folder to download from Azure storage
  494. :param category: type str: Category of object to be copied. Limited to file or folder
  495. :param common_path: type str: Calculated common path between the specified file/folder and the blob_file.name
  496. """
  497. # Create the blob client
  498. blob_client = create_blob_client(blob_service_client=blob_service_client,
  499. container_name=container_name,
  500. blob_file=blob_file)
  501. # Extract the folder structure of the blob e.g. 220202-m05722/InterOp
  502. folder_structure = list(os.path.split(os.path.dirname(blob_file.name)))
  503. # Add the nested folder to the path as requested
  504. target_path = os.path.join(path, os.path.join(*folder_structure))
  505. # Set the name of file by removing any path information
  506. file_name = os.path.basename(blob_file.name)
  507. # Finally, set the name and the path of the output file
  508. if category is None:
  509. target_file = os.path.join(path, file_name)
  510. # If a folder is being moved, join the path, the common path between the blob file and the supplied folder name
  511. # with the file name
  512. else:
  513. if object_name is not None:
  514. target_file = os.path.join(path, common_path, os.path.basename(blob_file.name))
  515. # If a container is being moved, join the target path and the name of the directory of the blob_file to the
  516. # file name
  517. else:
  518. # Create a pathlib.Path object from the blob file
  519. file_path = pathlib.Path(blob_file.name)
  520. # Determine the parental path of the file. If the file is in the root, it will be a dot. This won't work
  521. # with the joining logic, so change it to ''
  522. nested_path = file_path.parent if file_path.parent == '.' else ''
  523. # Join the target path, nested path, and file name
  524. target_file = os.path.join(target_path, nested_path, file_name)
  525. # Create a blob client for the target blob
  526. target_blob_client = blob_service_client.get_blob_client(target_container, target_file)
  527. # Copy the source file to the target file - allow up to 1000 seconds total
  528. target_blob_client.start_copy_from_url(blob_client.url)
  529. # Set the storage tier
  530. target_blob_client.set_standard_blob_tier(standard_blob_tier=storage_tier)
  531. # Ensure that the copy is complete before proceeding
  532. for i in range(100):
  533. # Extract the properties of the target blob client
  534. target_blob_properties = target_blob_client.get_blob_properties()
  535. # Break when the status is set to 'success'. The copy is successful
  536. if target_blob_properties.copy.status == 'success':
  537. # Copy finished
  538. break
  539. # Sleep for 10 seconds
  540. time.sleep(10)
  541. def delete_container(blob_service_client, container_name, account_name):
  542. """
  543. Delete a container in Azure storage
  544. :param blob_service_client: type: azure.storage.blob.BlobServiceClient
  545. :param container_name: type str: Name of the container of interest
  546. :param account_name: type str: Name of the Azure storage account
  547. """
  548. # Delete container if it exists
  549. try:
  550. blob_service_client.delete_container(container_name)
  551. except azure.core.exceptions.ResourceNotFoundError:
  552. logging.error(f'Could not locate {container_name} in {account_name}. Perhaps it has already been deleted?')
  553. raise SystemExit
  554. def extract_common_path(object_name, blob_file):
  555. """
  556. Extract the common path (if any) between a file in Azure storage, and a user-supplied folder name
  557. :param object_name: type str: Name and path of file/folder to download from Azure storage
  558. :param blob_file: type iterable from azure.storage.blob.BlobServiceClient.ContainerClient.list_blobs
  559. :return: common_path: The calculated common path between the folder and the file in blob storage (can be None)
  560. """
  561. # Create the pathlib.Path objects for both the folder and the blob file
  562. object_path = pathlib.Path(os.path.normpath(object_name))
  563. blob_path = pathlib.Path(blob_file.name).parent
  564. # If there is a common path between the folder and the blob file path, then there is a match
  565. try:
  566. common_path = blob_path.relative_to(object_path)
  567. # Change the dot returned by an exact match to the directory with ''
  568. common_path = common_path if str(common_path) != '.' else ''
  569. except ValueError:
  570. common_path = None
  571. return common_path
  572. def delete_file(container_client, object_name, blob_service_client, container_name):
  573. """
  574. Delete a file from Azure storage
  575. :param container_client: type azure.storage.blob.BlobServiceClient.ContainerClient
  576. :param object_name: type str: Name and path of file/folder to download from Azure storage
  577. :param blob_service_client: type: azure.storage.blob.BlobServiceClient
  578. :param container_name: type str: Name of the container of interest
  579. """
  580. # Create a generator containing all the blobs in the container
  581. generator = container_client.list_blobs()
  582. # Create a boolean to determine if the blob has been located
  583. present = False
  584. for blob_file in generator:
  585. # Filter for the blob name
  586. if os.path.join(blob_file.name) == object_name:
  587. # Update the blob presence variable
  588. present = True
  589. # Create the blob client
  590. blob_client = create_blob_client(blob_service_client=blob_service_client,
  591. container_name=container_name,
  592. blob_file=blob_file)
  593. # Soft delete the blob
  594. blob_client.delete_blob()
  595. # Send a warning to the user that the blob could not be found
  596. if not present:
  597. logging.error(f'Could not locate the desired file {object_name}')
  598. raise SystemExit
  599. def delete_folder(container_client, object_name, blob_service_client, container_name, account_name):
  600. """
  601. Delete a folder from Azure storage
  602. :param container_client: type azure.storage.blob.BlobServiceClient.ContainerClient
  603. :param object_name: type str: Name and path of file/folder to download from Azure storage
  604. :param blob_service_client: type: azure.storage.blob.BlobServiceClient
  605. :param container_name: type str: Name of the container of interest
  606. :param account_name: type str: Name of the Azure storage account
  607. """
  608. # Create a generator containing all the blobs in the container
  609. generator = container_client.list_blobs()
  610. # Create a boolean to determine if the blob has been located
  611. present = False
  612. for blob_file in generator:
  613. common_path = extract_common_path(object_name=object_name,
  614. blob_file=blob_file)
  615. # Only copy the file if there is a common path between the object path and the blob path (they match)
  616. if common_path is not None:
  617. # Update the folder presence boolean
  618. present = True
  619. # Create the blob client
  620. blob_client = create_blob_client(blob_service_client=blob_service_client,
  621. container_name=container_name,
  622. blob_file=blob_file)
  623. # Soft delete the blob
  624. blob_client.delete_blob()
  625. # Log an error that the folder could not be found
  626. if not present:
  627. logging.error(
  628. f'There was an error deleting folder {object_name} in container {container_name}, '
  629. f'in Azure storage account {account_name}. Please ensure that all arguments have been '
  630. f'entered correctly')
  631. raise SystemExit
  632. def arg_dict_cleanup(arg_dict):
  633. """
  634. Clean up the argument dictionary to be consistent with the format required for the AzureStorage classes
  635. :param arg_dict: type dict: Dictionary of argument name: value e.g. storage tier: nan
  636. :return: arg_dict: Cleaned argument dictionary
  637. """
  638. try:
  639. # Double single quotes are not automatically changed into an empty string
  640. arg_dict['reset_path'] = arg_dict['reset_path'] if arg_dict['reset_path'] != "''" else str()
  641. except KeyError:
  642. pass
  643. # For optional argument, the nan value supplied for empty values will not work with downstream code; find and
  644. # change them to the appropriate empty/default value
  645. try:
  646. arg_dict['reset_path'] = arg_dict['reset_path'] if str(arg_dict['reset_path']) != str(np.nan) else None
  647. except KeyError:
  648. pass
  649. try:
  650. arg_dict['storage_tier'] = arg_dict['storage_tier'] if str(arg_dict['storage_tier']) != str(np.nan) else 'Hot'
  651. except KeyError:
  652. pass
  653. try:
  654. arg_dict['output_file'] = arg_dict['output_file'] if str(arg_dict['output_file']) != str(np.nan) \
  655. else os.path.join(os.getcwd(), 'sas_urls.txt')
  656. except KeyError:
  657. pass
  658. try:
  659. arg_dict['output_path'] = arg_dict['output_path'] if str(arg_dict['output_path']) != str(np.nan) \
  660. else os.getcwd()
  661. except KeyError:
  662. pass
  663. try:
  664. arg_dict['expiry'] = arg_dict['expiry'] if str(arg_dict['expiry']) != str(np.nan) else 10
  665. except KeyError:
  666. pass
  667. try:
  668. arg_dict['retention_time'] = arg_dict['retention_time'] if str(arg_dict['retention_time']) != str(np.nan) else 8
  669. except KeyError:
  670. pass
  671. # Reading in numerical container names e.g. 220202 returns an integer, so typecast it to string
  672. arg_dict['container'] = str(arg_dict['container'])
  673. try:
  674. arg_dict['target'] = str(arg_dict['target'])
  675. except KeyError:
  676. pass
  677. return arg_dict
  678. def create_batch_dict(batch_file, headers):
  679. """
  680. Read in the supplied file of arguments with pandas. Create a dictionary of the arguments from a transposed dataframe
  681. :param batch_file: type str: Name and path of file containing requested operations
  682. :param headers: type list: Names of all the headers present in the file
  683. :return: Pandas dataframe.transpose().to_dict() of header: value extracted from the desired operation
  684. """
  685. # Ensure that the batch file exists
  686. try:
  687. assert os.path.isfile(batch_file)
  688. except AssertionError:
  689. logging.error(f'Could not locate the supplied batch file {batch_file}. Please ensure the you entered '
  690. f'the name and path correctly')
  691. raise SystemExit
  692. # Read in the batch file using pandas.read_csv. Use tabs as the separator, and provide the header names.
  693. # Transpose the data, and convert the dataframe to a dictionary
  694. batch_dict = pd.read_csv(
  695. batch_file,
  696. sep='\t',
  697. names=headers
  698. ).transpose().to_dict()
  699. return batch_dict
  700. def parse_batch_file(line):
  701. """
  702. Extract the requested command and subcommand from a line from an AzureAutomate batch file. Create a dictionary with
  703. the appropriate header:value for that command and subcommand combination
  704. :param line: type str: Individual line of text from batch file detailing requested operations. Format is:
  705. command;subcommand;operation-specific arguments
  706. :return: command: type str: Desired command to run e.g. upload, sas, move, download, tier, delete
  707. :return: subcommand: Subcommand for operation e.g. container, file, folder
  708. :return: batch_dict: Pandas dataframe.transpose().to_dict() of header: value extracted from the desired operation
  709. """
  710. # Create a dictionary of the appropriate headers for each command and subcommand combination
  711. header_dict = {
  712. 'upload': {
  713. 'file': ['command', 'subcommand', 'container', 'file', 'reset_path', 'storage_tier'],
  714. 'folder': ['command', 'subcommand', 'container', 'folder', 'reset_path', 'storage_tier']
  715. },
  716. 'sas': {
  717. 'container': ['command', 'subcommand', 'container', 'expiry', 'output_file'],
  718. 'file': ['command', 'subcommand', 'container', 'file', 'expiry', 'output_file'],
  719. 'folder': ['command', 'subcommand', 'container', 'folder', 'expiry', 'output_file']
  720. },
  721. 'move': {
  722. 'container': ['command', 'subcommand', 'container', 'target', 'reset_path', 'storage_tier'],
  723. 'file': ['command', 'subcommand', 'container', 'target', 'file', 'reset_path', 'storage_tier'],
  724. 'folder': ['command', 'subcommand', 'container', 'target', 'folder', 'reset_path', 'storage_tier']
  725. },
  726. 'download': {
  727. 'container': ['command', 'subcommand', 'container', 'output_path'],
  728. 'file': ['command', 'subcommand', 'container', 'file', 'output_path'],
  729. 'folder': ['command', 'subcommand', 'container', 'folder', 'output_path']
  730. },
  731. 'tier': {
  732. 'container': ['command', 'subcommand', 'container', 'storage_tier'],
  733. 'file': ['command', 'subcommand', 'container', 'file', 'storage_tier'],
  734. 'folder': ['command', 'subcommand', 'container', 'folder', 'storage_tier']
  735. },
  736. 'delete': {
  737. 'container': ['command', 'subcommand', 'container'],
  738. 'file': ['command', 'subcommand', 'container', 'file', 'retention_time'],
  739. 'folder': ['command', 'subcommand', 'container', 'folder', 'retention_time']
  740. }
  741. }
  742. # Extract the command and subcommand from the line. They will be the first two entries
  743. try:
  744. command = line.split('\t')[0]
  745. subcommand = line.split('\t')[1]
  746. except IndexError:
  747. logging.error(f'Could not extract the desired command and subcommand from your file. Please review the '
  748. f'following line {line}')
  749. raise SystemExit
  750. # Use the extracted command and subcommand to determine the appropriate headers
  751. try:
  752. headers = header_dict[command][subcommand]
  753. except KeyError:
  754. logging.error(f'Could not find the requested command {command} and subcommand {subcommand} in the list of '
  755. f'commands. Please ensure that you created your batch file correctly')
  756. raise SystemExit
  757. # Use StringIO to convert the string into a format that can be read by pandas.read_csv
  758. input_string = StringIO(line.rstrip())
  759. # Read in the line using pandas.read_csv. Use tabs as the separator, and provide the header names.
  760. # Transpose the data, and convert the dataframe to a dictionary
  761. try:
  762. batch_dict = pd.read_csv(
  763. input_string,
  764. sep='\t',
  765. names=headers
  766. ).transpose().to_dict()
  767. except pandas.errors.ParserError as e:
  768. logging.error(f'Pandas error parsing data: {e}')
  769. raise SystemExit
  770. # Return the command, subcommand, and parsed dictionary
  771. return command, subcommand, batch_dict