azure_list.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. #!/usr/bin/env python
  2. from azure_storage.methods import create_blob_service_client, client_prep, create_parent_parser, \
  3. extract_connection_string, setup_arguments
  4. from argparse import ArgumentParser, RawTextHelpFormatter
  5. from termcolor import colored
  6. import coloredlogs
  7. import logging
  8. import pathlib
  9. import sys
  10. import os
  11. import re
  12. class AzureContainerList(object):
  13. def main(self):
  14. # Hide the INFO-level messages sent to the logger from Azure by increasing the logging level to WARNING
  15. logging.getLogger().setLevel(logging.WARNING)
  16. # Extract the connection string from the system keyring
  17. self.connect_str = extract_connection_string(passphrase=self.passphrase,
  18. account_name=self.account_name)
  19. # Create the blob service client using the connection string
  20. self.blob_service_client = create_blob_service_client(connect_str=self.connect_str)
  21. containers = self.list_containers(blob_service_client=self.blob_service_client,
  22. expression=self.expression,
  23. print_container=self.print_container,
  24. output_file=self.output_file)
  25. return containers
  26. @staticmethod
  27. def list_containers(blob_service_client, expression, print_container, output_file):
  28. """
  29. List all containers in a storage account. If an expression is provided, find all containers that
  30. match the expression
  31. :param blob_service_client: type: azure.storage.blob.BlobServiceClient
  32. :param expression: type str: Expression to match. Can be a regular expression or 'normal' expression
  33. :param print_container: type bool: Boolean on whether to print container matches to the terminal
  34. :param output_file: type str: Name and path of file in which container names are to be written. Optional
  35. :return: container_matches: List of containers that match the expression
  36. """
  37. # Create a generator of all the containers in the storage account
  38. containers = blob_service_client.list_containers()
  39. # Prepare a list to store the containers that match the expression
  40. container_matches = list()
  41. # Allow a quiet exit on keyboard interrupts
  42. try:
  43. for container in containers:
  44. # Boolean to determine whether the expression matched the container name
  45. match = False
  46. # If the expression contains non-alphanumeric characters either at the start or anywhere, treat
  47. # it as a regular expression
  48. if re.match(r'.*\W', expression.replace('-', '_')):
  49. # Use re.sub to convert * to .* to be consistent with regex rules
  50. # It seemed unintuitive to force the user to use .* rather than just * for simple queries.
  51. # If .* was provided, don't add the '.' by using a negative lookbehind assertion
  52. regex_expression = re.sub(r'(?<!\.)\*', '.*', expression)
  53. # Use re.fullmatch to determine if the expression matches the container name
  54. if re.fullmatch(rf'{regex_expression}$', container.name):
  55. # Update the match boolean and append the container to the list of matches
  56. match = True
  57. container_matches.append(container)
  58. # The expression doesn't appear to be a regular expression
  59. else:
  60. # Ensure a perfect match for non regex queries
  61. if expression == container.name:
  62. # Update the match boolean and append the container to the list of matches
  63. match = True
  64. container_matches.append(container)
  65. # Print the name of the container on a match
  66. if print_container and match:
  67. # Use termcolor to print the name in bold green
  68. print(colored(container.name, 'green', attrs=['bold']))
  69. # If requested, write the name of the container to the output file on a match
  70. if output_file and match:
  71. with open(output_file, 'a+') as output:
  72. output.write(f'{container.name}\n')
  73. except KeyboardInterrupt:
  74. raise SystemExit
  75. return container_matches
  76. def __init__(self, expression, account_name, output_file, passphrase, print_container=True):
  77. self.expression = expression if expression else '*'
  78. self.account_name = account_name
  79. self.passphrase = passphrase
  80. # Ensure that the output file can be used
  81. if output_file:
  82. # Output file
  83. if output_file.startswith('~'):
  84. self.output_file = os.path.abspath(os.path.expanduser(os.path.join(output_file)))
  85. else:
  86. self.output_file = os.path.abspath(os.path.join(output_file))
  87. # Ensure that the output file can be used
  88. if not os.path.isfile(self.output_file):
  89. try:
  90. # Create the parental directory for the output file as required
  91. os.makedirs(os.path.dirname(self.output_file), exist_ok=True)
  92. except PermissionError:
  93. logging.error(f'Insufficient permissions to create output file {self.output_file}')
  94. raise SystemExit
  95. try:
  96. open(self.output_file, 'w').close()
  97. except IsADirectoryError:
  98. logging.error(
  99. f'A directory or an empty file name was provided for the output file {self.output_file}')
  100. raise SystemExit
  101. except PermissionError:
  102. logging.error(f'Insufficient permissions to create output file {self.output_file}')
  103. raise SystemExit
  104. else:
  105. self.output_file = str()
  106. self.connect_str = str()
  107. self.blob_service_client = None
  108. # Boolean on whether the container name should be printed to terminal
  109. self.print_container = print_container
  110. class AzureList(object):
  111. def main(self):
  112. # Hide the INFO-level messages sent to the logger from Azure by increasing the logging level to WARNING
  113. logging.getLogger().setLevel(logging.WARNING)
  114. # If the container name was provided, and does not look like a regular expression, run the client_prep method
  115. # to validate the container name, extract the connection string, and create the blob service client and
  116. # container client
  117. if self.container_name and not re.match(r'.*\W', self.container_name.replace('-', '_')):
  118. self.container_name, self.connect_str, self.blob_service_client, container_client = \
  119. client_prep(container_name=self.container_name,
  120. passphrase=self.passphrase,
  121. account_name=self.account_name)
  122. # List all the files that match the expression
  123. self.list_files(container_client=container_client,
  124. expression=self.expression,
  125. output_file=self.output_file,
  126. container_name=self.container_name)
  127. # If the container name wasn't provided, or looks like a regular expression, use the AzureContainerList class
  128. # to find containers that match the provided expression
  129. else:
  130. list_containers = AzureContainerList(
  131. expression=self.container_name,
  132. account_name=self.account_name,
  133. output_file=str(),
  134. passphrase=self.passphrase,
  135. print_container=False
  136. )
  137. containers = list_containers.main()
  138. # Extract the connection string from the system keyring
  139. self.connect_str = extract_connection_string(passphrase=self.passphrase,
  140. account_name=self.account_name)
  141. # Create the blob service client using the connection string
  142. self.blob_service_client = create_blob_service_client(connect_str=self.connect_str)
  143. # List all the files in each of the containers that match the provided expression
  144. for container in containers:
  145. # Create a container client for the container
  146. container_client = self.blob_service_client.get_container_client(container.name)
  147. # Run the list_files method to list and optionally filter the files
  148. self.list_files(container_client=container_client,
  149. expression=self.expression,
  150. output_file=self.output_file,
  151. container_name=container.name)
  152. @staticmethod
  153. def list_files(container_client, expression, output_file, container_name):
  154. """
  155. List and optionally filter (with a user-provided expression) all files in a container
  156. :param container_client: type azure.storage.blob.BlobServiceClient.ContainerClient
  157. :param expression: type str: Expression to match. Can be a regular expression or 'normal' expression
  158. :param output_file: type str: Name and path of file in which container names are to be written. Optional
  159. :param container_name: type str: Name of the container of interest
  160. """
  161. # Create a generator containing all the blobs in the container
  162. generator = container_client.list_blobs()
  163. # Allow a quiet exit on keyboard interrupts
  164. try:
  165. # Iterate through all the files in the container
  166. for blob_file in generator:
  167. # Store the file name and path in a variable
  168. filename = blob_file.name
  169. # Initialise a variable to track whether this file is a match to the expression
  170. match = False
  171. # Use pathlib to create a path object from the file name
  172. path_obj = pathlib.Path(os.path.normpath(filename))
  173. # Split the file name into its separate components
  174. components = path_obj.parts
  175. # Check whether the expression contains non-alphanumeric characters. If it does, treat it as a
  176. # regular expression. Ignore dashes as a non-alphanumeric character.
  177. if re.match(r'.*\W', expression.replace('-', '_')):
  178. # If the expression is targeted to nested files/folders, split the expression into its
  179. # path components e.g. reports/outputs/output.tsv contains three components
  180. expression_obj = pathlib.Path(os.path.normpath(expression))
  181. expression_components = list(expression_obj.parts)
  182. # The number of matches required is the number of path components
  183. # e.g. reports/outputs/output.tsv requires three matches
  184. matches_required = len(expression_components)
  185. # Initialise a dictionary to track matches to each of the components
  186. component_matches = dict()
  187. # Search through all the path components of the file name
  188. for i, component in enumerate(components):
  189. # Check for nested files/folders
  190. if len(expression_components) > 1:
  191. while len(expression_components) < len(components):
  192. expression_components.insert(-1, '*')
  193. # Reset the number of matches required to the new length of the expression components
  194. matches_required = len(expression_components)
  195. # Use re.sub to convert * to .* to be consistent with regex rules
  196. regex_expression = re.sub(r'(?<!\.)\*', '.*', expression_components[i])
  197. # If the components match, increment the number of matches
  198. if re.fullmatch(rf'{regex_expression}$', component):
  199. # Set the match to the current component to true
  200. component_matches[component] = True
  201. else:
  202. # Use re.sub to convert * to .* to be consistent with regex rules
  203. regex_expression = re.sub(r'(?<!\.)\*', '.*', expression)
  204. # If the component matches, set the match boolean to True
  205. if re.fullmatch(rf'{regex_expression}$', component):
  206. match = True
  207. # Check to see if the number of matches observed in a multi-component expression is the number
  208. # matches required for a match before setting the match boolean to True
  209. if len(component_matches) == matches_required:
  210. match = True
  211. # The expression does not look like a regular expression
  212. else:
  213. for component in components:
  214. # An exact match is required to be considered a match
  215. if expression == component:
  216. match = True
  217. # Only proceed if the file matches the expression
  218. if match:
  219. # If the output file has been provided, write the file name to it
  220. if output_file:
  221. with open(output_file, 'a+') as output:
  222. output.write(f'{container_name}\t{filename}\n')
  223. # Initialise a variable to store the path information of the file
  224. file_path = None
  225. # Use termcolor to print the container name in bold green
  226. container = colored(container_name, 'green', attrs=['bold'])
  227. # Determine if the file is nested in one or more folders
  228. if len(path_obj.parts) > 1:
  229. # Use termcolor to print the path in bold blue
  230. file_path = colored(f'{os.sep.join(components[:-1])}{os.sep}', 'blue', attrs=['bold'])
  231. # Remove any path information from the file name
  232. filename = os.path.basename(filename)
  233. # Use termcolor to print any archive files as bold red
  234. if filename.endswith('.gz') or filename.endswith('.bz2') or filename.endswith('.zip'):
  235. filename = colored(filename, 'red', attrs=['bold'])
  236. # If the file was nested, print the extracted path information
  237. if file_path:
  238. print(f'{container}\t{file_path}{filename}')
  239. # Otherwise, only print the file name
  240. else:
  241. print(f'{container}\t{filename}')
  242. except KeyboardInterrupt:
  243. raise SystemExit
  244. def __init__(self, container_name, expression, output_file, account_name, passphrase):
  245. # If the container name wasn't provided, set it to *
  246. self.container_name = container_name if container_name else '*'
  247. self.expression = expression if expression else '*'
  248. self.account_name = account_name
  249. if output_file:
  250. # Output file
  251. if output_file.startswith('~'):
  252. self.output_file = os.path.abspath(os.path.expanduser(os.path.join(output_file)))
  253. else:
  254. self.output_file = os.path.abspath(os.path.join(output_file))
  255. # Ensure that the output file can be used
  256. if not os.path.isfile(self.output_file):
  257. try:
  258. # Create the parental directory for the output file as required
  259. os.makedirs(os.path.dirname(self.output_file), exist_ok=True)
  260. except PermissionError:
  261. logging.error(f'Insufficient permissions to create output file {self.output_file}')
  262. raise SystemExit
  263. try:
  264. open(self.output_file, 'w').close()
  265. except IsADirectoryError:
  266. logging.error(
  267. f'A directory or an empty file name was provided for the output file {self.output_file}')
  268. raise SystemExit
  269. except PermissionError:
  270. logging.error(f'Insufficient permissions to create output file {self.output_file}')
  271. raise SystemExit
  272. else:
  273. self.output_file = str()
  274. self.passphrase = passphrase
  275. self.connect_str = str()
  276. self.blob_service_client = None
  277. def container_search(args):
  278. """
  279. Run the AzureContainerList class
  280. :param args: type ArgumentParser arguments
  281. """
  282. # Welcome message that is adjusted depending on whether an expression has been provided
  283. phrase = f'Listing containers in Azure storage account {args.account_name}.'
  284. if args.expression:
  285. phrase += f'\nFiltering containers with the expression: {args.expression}'
  286. logging.info(phrase)
  287. list_containers = AzureContainerList(
  288. expression=args.expression,
  289. account_name=args.account_name,
  290. output_file=args.output_file,
  291. passphrase=args.passphrase
  292. )
  293. list_containers.main()
  294. def azure_search(args):
  295. """
  296. Run the AzureList class with the provided command line arguments
  297. :param args: type ArgumentParser arguments
  298. """
  299. # Welcome message that is adjusted depending on whether a container and/or an expression have been provided
  300. phrase = f'Searching for files in Azure storage account {args.account_name}.'
  301. if args.container_name:
  302. phrase += f'\nFiltering containers with the expression: {args.container_name}'
  303. phrase += f'\nFiltering files with the expression: {args.expression}'
  304. logging.info(phrase)
  305. list_files = AzureList(
  306. container_name=args.container_name,
  307. expression=args.expression,
  308. account_name=args.account_name,
  309. output_file=args.output_file,
  310. passphrase=args.passphrase
  311. )
  312. list_files.main()
  313. def cli():
  314. parser = ArgumentParser(description='Explore your Azure storage account')
  315. subparsers, parent_parser = create_parent_parser(parser=parser,
  316. container=False)
  317. parent_parser.add_argument('expression',
  318. nargs='?', # This allows the argument to be optional so things behave like actual ls.
  319. default=None,
  320. type=str,
  321. help='Expression to search. This command supports regular expressions. '
  322. 'e.g. 1912* will return all containers starting with 1912, including 191216-dar '
  323. 'Note that since the regular expression is being entered on the command line, '
  324. 'you may need to escape certain characters e.g. ! should be \\!')
  325. parent_parser.add_argument('-o', '--output_file',
  326. default=str(),
  327. help='Optionally provide the name and path of file in which the outputs '
  328. 'are to be saved.')
  329. container_subparser = subparsers.add_parser(parents=[parent_parser],
  330. name='container',
  331. description='Filter and list containers in your Azure storage account',
  332. formatter_class=RawTextHelpFormatter,
  333. help='Filter and list containers in your Azure storage account')
  334. container_subparser.set_defaults(func=container_search)
  335. ls_subparser = subparsers.add_parser(parents=[parent_parser],
  336. name='search',
  337. description='Filter files in a container (or containers) in Azure storage',
  338. formatter_class=RawTextHelpFormatter,
  339. help='Filter files in a container (or containers) in Azure storage')
  340. ls_subparser.add_argument('-c', '--container_name',
  341. nargs='?',
  342. type=str,
  343. default=str(),
  344. help='Name of the Azure storage container. This command supports regular expressions '
  345. 'e.g. 1912* will return all containers starting with 1912.'
  346. 'Note that since the regular expression is being entered on the command line, '
  347. 'you may need to escape certain characters e.g. ! should be \\! '
  348. 'You can make your queries as complex as you wish: '
  349. '1912\\d{2}-\\D{3}\(\?\!*output\) will only return '
  350. 'containers that start with 1912, and have two additional digits. If '
  351. 'the word output is present, any matches are ignored. There also '
  352. 'have to be exactly three letters following a dash and the first six numbers '
  353. 'e.g. 191216-dar and 191227-dar will be returned but not 191216-dar-outputs '
  354. '191202-test, 191216dar, 1912162-dar, 191203-m05722, 191114-gta, '
  355. 'or 200105-dar (and many others)')
  356. ls_subparser.set_defaults(func=azure_search)
  357. # Set up the arguments, and run the appropriate subparser
  358. arguments = setup_arguments(parser=parser)
  359. # Return to the requested logging level, as it has been increased to WARNING to suppress the log being filled with
  360. # information from azure.core.pipeline.policies.http_logging_policy
  361. coloredlogs.install(level=arguments.verbosity.upper())
  362. # Prevent the arguments being printed to the console (they are returned in order for the tests to work)
  363. sys.stderr = open(os.devnull, 'w')
  364. return arguments